author     Michalis Spyrou <michalis.spyrou@arm.com>  2020-10-21 00:04:14 +0100
committer  Michalis Spyrou <michalis.spyrou@arm.com>  2020-11-03 15:10:47 +0000
commit     ebcebf1dee7f8314976b1e0cabd62b4cf893d765 (patch)
tree       95d3e691a0e88a3e213a1d30446a9224497f2055 /src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
parent     da4b1b2055d96aaf73704eb9b0b82d74dc2d699c (diff)
download   ComputeLibrary-ebcebf1dee7f8314976b1e0cabd62b4cf893d765.tar.gz
COMPMID-3638: Move NEON kernels
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Ieed3e4bc8be7fef80c90c5094599b477a56fc473
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4285
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp')
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp  75
1 file changed, 48 insertions(+), 27 deletions(-)
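The diff below applies one recurring pattern: kernel headers move out of the public include tree into src/core/NEON/kernels/, the function's kernel members change from by-value objects to std::unique_ptr of now forward-declared kernel types, the destructor is defaulted in the .cpp where those kernel definitions are visible, kernels are created with make_unique at configure time, and the scheduler receives the raw pointer via get(). A minimal, self-contained sketch of that pattern follows; FooKernel, FooFunction and run() are hypothetical names for illustration only (the library itself uses arm_compute::support::cpp14::make_unique and NEScheduler, and its real constructor has a long initializer list rather than being defaulted).

// Sketch only: hypothetical types, not Compute Library API.
#include <memory>

// --- "header" side: only a forward declaration of the kernel is needed ---
class FooKernel; // full definition lives in an internal src/ header

class FooFunction
{
public:
    FooFunction();
    ~FooFunction(); // declared here, defined where FooKernel is a complete type
    void configure(int param);
    void run();

private:
    std::unique_ptr<FooKernel> _kernel; // previously a by-value member
};

// --- "source" side: the kernel definition is visible here ---
class FooKernel
{
public:
    void configure(int param) { _param = param; }
    void run() { /* operate on _param */ }

private:
    int _param{ 0 };
};

// Defaulting these in the .cpp lets unique_ptr destroy the forward-declared
// FooKernel without exposing the kernel header to users of FooFunction.
FooFunction::FooFunction() = default;
FooFunction::~FooFunction() = default;

void FooFunction::configure(int param)
{
    // Kernel is created at configure time, mirroring the make_unique calls in the diff.
    _kernel = std::make_unique<FooKernel>();
    _kernel->configure(param);
}

void FooFunction::run()
{
    // The scheduler-style call sites now take the raw pointer, e.g. schedule(_kernel.get(), ...).
    _kernel->run();
}

int main()
{
    FooFunction f;
    f.configure(42);
    f.run();
    return 0;
}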
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 36357dde41..9050427b34 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -34,12 +34,23 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/core/helpers/AutoConfiguration.h"
+
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
#include "support/MemorySupport.h"
namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
+NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
+
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
: _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(),
@@ -80,7 +91,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
_memory_group.manage(&_signed_a);
- _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+ _convert_to_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
+ _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
a_to_use = &_signed_a;
_a_offset = _signed_a.info()->quantization_info().uniform().offset;
@@ -153,10 +165,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
}
// Configure interleave kernel
- _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a);
+ _mtx_a_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
// Configure transpose kernel
- _mtx_b_reshape_kernel.configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ _mtx_b_reshape_kernel->configure(b, &_tmp_b);
}
if(!_fused_assembly_path)
@@ -176,7 +190,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
}
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info);
+ _mtx_b_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixBReductionKernel>();
+ _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -188,7 +203,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_memory_group.manage(&_vector_sum_row);
// Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info);
+ _mtx_a_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+ _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
}
if(_fuse_output_stage)
@@ -196,19 +212,22 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
// Configure matrix multiply kernel
if(!_assembly_path)
{
- _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32);
+ _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
}
- _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
- _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : output,
- a->info()->dimension(0),
- _a_offset, _b_offset, info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>();
+ _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
+ _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : output,
+ a->info()->dimension(0),
+ _a_offset, _b_offset, info.gemmlowp_output_stage());
if(_flip_signedness)
{
- _convert_from_signed_asymm.configure(&_signed_output, output);
+ _convert_from_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
+ _convert_from_signed_asymm->configure(&_signed_output, output);
}
}
else
@@ -216,10 +235,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
// Configure matrix multiply kernel
if(!_assembly_path)
{
- _mm_kernel.configure(matrix_a, matrix_b, output);
+ _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, output);
}
// Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
+ _offset_contribution_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionKernel>();
+ _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
}
// Configure activation
@@ -468,7 +489,7 @@ void NEGEMMLowpMatrixMultiplyCore::run()
// Convert QASYMM8->QASYMM8_SIGNED
if(_flip_signedness)
{
- NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+ NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY);
}
// Run GEMM
@@ -481,15 +502,15 @@ void NEGEMMLowpMatrixMultiplyCore::run()
if(!_run_vector_matrix_multiplication)
{
// Run interleave kernel
- NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
}
- NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
}
if(!_fused_assembly_path)
@@ -497,31 +518,31 @@ void NEGEMMLowpMatrixMultiplyCore::run()
// Run matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
{
- NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+ NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX);
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
}
if(_fuse_output_stage)
{
// Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+ NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY);
}
else
{
// Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+ NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY);
}
}
// Convert QASYMM8_SIGNED->QASYMM8
- if(_flip_signedness)
+ if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
{
- NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+ NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY);
}
// Run fused activation unless already run in the fused assembly
@@ -560,7 +581,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
// Run reshape kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
- NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
if(!original_b_managed_by_weights_manager)
{
_original_b->mark_as_unused();
@@ -571,7 +592,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
{
_vector_sum_col.allocator()->allocate();
- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
}
_is_prepared = true;