author     Michalis Spyrou <michalis.spyrou@arm.com>  2020-10-21 00:04:14 +0100
committer  Michalis Spyrou <michalis.spyrou@arm.com>  2020-11-03 15:10:47 +0000
commit     ebcebf1dee7f8314976b1e0cabd62b4cf893d765 (patch)
tree       95d3e691a0e88a3e213a1d30446a9224497f2055 /src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
parent     da4b1b2055d96aaf73704eb9b0b82d74dc2d699c (diff)
download   ComputeLibrary-ebcebf1dee7f8314976b1e0cabd62b4cf893d765.tar.gz
COMPMID-3638: Move NEON kernels
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Ieed3e4bc8be7fef80c90c5094599b477a56fc473
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4285
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp')
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp  75
1 file changed, 48 insertions(+), 27 deletions(-)
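The diff below applies one recurring pattern: kernel headers move out of the public include tree into src/core/NEON/kernels/, the function's kernel members change from by-value objects to std::unique_ptr of now forward-declared kernel types, the destructor is defaulted in the .cpp where those kernel definitions are visible, kernels are created with make_unique at configure time, and the scheduler receives the raw pointer via get(). A minimal, self-contained sketch of that pattern follows; FooKernel, FooFunction and run() are hypothetical names for illustration only (the library itself uses arm_compute::support::cpp14::make_unique and NEScheduler, and its real constructor has a long initializer list rather than being defaulted).

// Sketch only: hypothetical types, not Compute Library API.
#include <memory>

// --- "header" side: only a forward declaration of the kernel is needed ---
class FooKernel; // full definition lives in an internal src/ header

class FooFunction
{
public:
    FooFunction();
    ~FooFunction(); // declared here, defined where FooKernel is a complete type
    void configure(int param);
    void run();

private:
    std::unique_ptr<FooKernel> _kernel; // previously a by-value member
};

// --- "source" side: the kernel definition is visible here ---
class FooKernel
{
public:
    void configure(int param) { _param = param; }
    void run() { /* operate on _param */ }

private:
    int _param{ 0 };
};

// Defaulting these in the .cpp lets unique_ptr destroy the forward-declared
// FooKernel without exposing the kernel header to users of FooFunction.
FooFunction::FooFunction() = default;
FooFunction::~FooFunction() = default;

void FooFunction::configure(int param)
{
    // Kernel is created at configure time, mirroring the make_unique calls in the diff.
    _kernel = std::make_unique<FooKernel>();
    _kernel->configure(param);
}

void FooFunction::run()
{
    // The scheduler-style call sites now take the raw pointer, e.g. schedule(_kernel.get(), ...).
    _kernel->run();
}

int main()
{
    FooFunction f;
    f.configure(42);
    f.run();
    return 0;
}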
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 36357dde41..9050427b34 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -34,12 +34,23 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/core/helpers/AutoConfiguration.h"
+
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
#include "support/MemorySupport.h"
namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
+NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
+
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
: _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(),
@@ -80,7 +91,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
_memory_group.manage(&_signed_a);
- _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+ _convert_to_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
+ _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
a_to_use = &_signed_a;
_a_offset = _signed_a.info()->quantization_info().uniform().offset;
@@ -153,10 +165,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
}
// Configure interleave kernel
- _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a);
+ _mtx_a_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
// Configure transpose kernel
- _mtx_b_reshape_kernel.configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ _mtx_b_reshape_kernel->configure(b, &_tmp_b);
}
if(!_fused_assembly_path)
@@ -176,7 +190,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
}
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info);
+ _mtx_b_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixBReductionKernel>();
+ _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -188,7 +203,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_memory_group.manage(&_vector_sum_row);
// Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info);
+ _mtx_a_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+ _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
}
if(_fuse_output_stage)
@@ -196,19 +212,22 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
// Configure matrix multiply kernel
if(!_assembly_path)
{
- _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32);
+ _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
}
- _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
- _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : output,
- a->info()->dimension(0),
- _a_offset, _b_offset, info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>();
+ _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
+ _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : output,
+ a->info()->dimension(0),
+ _a_offset, _b_offset, info.gemmlowp_output_stage());
if(_flip_signedness)
{
- _convert_from_signed_asymm.configure(&_signed_output, output);
+ _convert_from_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
+ _convert_from_signed_asymm->configure(&_signed_output, output);
}
}
else
@@ -216,10 +235,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
// Configure matrix multiply kernel
if(!_assembly_path)
{
- _mm_kernel.configure(matrix_a, matrix_b, output);
+ _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, output);
}
// Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
+ _offset_contribution_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionKernel>();
+ _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
}
// Configure activation
@@ -468,7 +489,7 @@ void NEGEMMLowpMatrixMultiplyCore::run()
// Convert QASYMM8->QASYMM8_SIGNED
if(_flip_signedness)
{
- NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+ NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY);
}
// Run GEMM
@@ -481,15 +502,15 @@ void NEGEMMLowpMatrixMultiplyCore::run()
if(!_run_vector_matrix_multiplication)
{
// Run interleave kernel
- NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
}
- NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
}
if(!_fused_assembly_path)
@@ -497,31 +518,31 @@ void NEGEMMLowpMatrixMultiplyCore::run()
// Run matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
{
- NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+ NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX);
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
}
if(_fuse_output_stage)
{
// Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+ NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY);
}
else
{
// Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+ NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY);
}
}
// Convert QASYMM8_SIGNED->QASYMM8
- if(_flip_signedness)
+ if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
{
- NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+ NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY);
}
// Run fused activation unless already run in the fused assembly
@@ -560,7 +581,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
// Run reshape kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
- NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
if(!original_b_managed_by_weights_manager)
{
_original_b->mark_as_unused();
@@ -571,7 +592,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
{
_vector_sum_col.allocator()->allocate();
- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
}
_is_prepared = true;