From b3be45759bdd0749ae3a16fe470820f0d9830ea9 Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park <sang-hoon.park@arm.com>
Date: Tue, 18 May 2021 10:46:00 +0100
Subject: Implement memory injection in CpuGemmDirectConv2d

The following operators are now stateless, as they implement
memory injection:

- CpuGemmDirectConv2d
- CpuGemmAssemblyDispatch

A test case is added to check that CpuGemmDirectConv2d can
run on different groups of tensors after a single configure() call.

Resolves: COMPMID-4506

Change-Id: I48f44ed41236ca7e18da2de07bdbacc9007a3c5e
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5718
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
---
 src/runtime/NEON/functions/NEGEMM.cpp              | 19 ++++++++++---
 src/runtime/NEON/functions/NEGEMMConv2d.cpp        | 21 ++++++++++++---
 .../functions/NEGEMMLowpMatrixMultiplyCore.cpp     | 31 +++++++++++++++-------
 3 files changed, 55 insertions(+), 16 deletions(-)

(limited to 'src/runtime/NEON')
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 7318c3e492..b526874790 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -38,6 +38,7 @@
 #include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 #include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
 #include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
 
 #include <cmath>
@@ -46,6 +47,14 @@ using namespace arm_compute::misc::shape_calculator;
 
 namespace arm_compute
 {
+using WorkspaceDataType = WorkspaceData<Tensor>;
+
+struct NEGEMM::AsmGlueTensors
+{
+    ITensorPack       tensors{};
+    WorkspaceDataType ws{};
+};
+
 namespace
 {
 cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
@@ -63,7 +72,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>()), _ma_kernel(),
       _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
-      _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+      _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _asm_glue_tensors(std::make_unique<AsmGlueTensors>())
 {
 }
 
@@ -94,7 +103,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
         _asm_glue->configure(a->info(), b->info(), c_info_to_use, d->info(), asm_info);
         ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
 
-        _asm_glue_tensors =
+        _asm_glue_tensors->tensors =
         {
             { ACL_SRC_0, a },
             { ACL_SRC_1, b },
@@ -102,6 +111,8 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
             { ACL_DST, d },
         };
 
+        _asm_glue_tensors->ws = manage_workspace<Tensor>(_asm_glue->workspace(), _memory_group, _asm_glue_tensors->tensors);
+
         // Scale product by alpha
         if(_run_alpha_scale)
         {
@@ -323,7 +334,7 @@ void NEGEMM::run()
 
     if(_asm_glue->is_configured())
     {
-        _asm_glue->run(_asm_glue_tensors);
+        _asm_glue->run(_asm_glue_tensors->tensors);
         if(_run_alpha_scale)
         {
             _alpha_scale_func.run();
@@ -377,7 +388,7 @@ void NEGEMM::prepare()
                 ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
             }
 
-            _asm_glue->prepare(_asm_glue_tensors);
+            _asm_glue->prepare(_asm_glue_tensors->tensors);
             if(!original_b_managed_by_weights_manager)
             {
                 _original_b->mark_as_unused();
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 94ceb6d27c..790543a34a 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -26,24 +26,37 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/MemoryHelpers.h"
 #include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
 
 #include <set>
 
 namespace arm_compute
 {
-using OperatorType = cpu::CpuGemmDirectConv2d;
+using OperatorType      = cpu::CpuGemmDirectConv2d;
+using WorkspaceDataType = WorkspaceData<Tensor>;
 
 struct NEGEMMConv2d::Impl
 {
     ITensorPack                   tensors{};
+    MemoryGroup                   mg{};
     std::unique_ptr<OperatorType> op{ nullptr };
+    WorkspaceDataType             ws{};
+
+    void allocate_and_add_workspace()
+    {
+        if(op)
+        {
+            ws = manage_workspace<Tensor>(op->workspace(), mg, tensors);
+        }
+    }
 };
 
 NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
     : _impl(std::make_unique<Impl>())
 {
-    _impl->op = std::make_unique<OperatorType>(memory_manager);
+    _impl->op = std::make_unique<OperatorType>();
+    _impl->mg = MemoryGroup(memory_manager);
 }
 
 NEGEMMConv2d::~NEGEMMConv2d() = default;
@@ -55,7 +68,9 @@ void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITens
     _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases);
     _impl->tensors.add_tensor(TensorType::ACL_DST, output);
 
-    _impl->op->configure(input->info(), weights->info(), biases->info(), output->info(), info);
+    _impl->op->configure(input->info(), weights->info(), ((biases) ? biases->info() : nullptr), output->info(), info);
+
+    _impl->allocate_and_add_workspace();
 }
 
 Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info)
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index cc0f20e695..d42e656e0c 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,10 +42,17 @@
 #include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
 #include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
 #include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
 #include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
 
 namespace arm_compute
 {
+using WorkspaceDataType = WorkspaceData<Tensor>;
+struct NEGEMMLowpMatrixMultiplyCore::AsmGlueTensors
+{
+    ITensorPack       tensors{};
+    WorkspaceDataType ws{};
+};
 namespace
 {
 cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
@@ -66,11 +73,11 @@ using namespace arm_compute::misc::shape_calculator;
 NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>(memory_manager, weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(),
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>(weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(),
       _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(),
       _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
       _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false),
-      _run_activation(false), _flip_signedness(false)
+      _run_activation(false), _flip_signedness(false), _asm_glue_tensors(std::make_unique<AsmGlueTensors>())
 {
 }
 
@@ -149,18 +156,24 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
                 auto c_info_to_use = c == nullptr ? nullptr : c->info();
                 _asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info);
                 _fused_assembly_path = _asm_glue->is_configured();
-                _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
-                _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output);
+                _asm_glue_tensors->tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
+                _asm_glue_tensors->tensors.add_tensor(TensorType::ACL_DST, output);
             }
             else
             {
                 auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : output);
                 _asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info);
-                _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
+                _asm_glue_tensors->tensors.add_tensor(TensorType::ACL_DST, output_to_use);
             }
             _assembly_path = _asm_glue->is_configured();
-            _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
-            _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
+            _asm_glue_tensors->tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
+            _asm_glue_tensors->tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
+
+            if(_assembly_path)
+            {
+                _asm_glue_tensors->ws = manage_workspace<Tensor>(_asm_glue->workspace(), _memory_group, _asm_glue_tensors->tensors);
+            }
+
             break;
         }
         default:
@@ -520,7 +533,7 @@ void NEGEMMLowpMatrixMultiplyCore::run()
     // Run GEMM
     if(_asm_glue->is_configured())
     {
-        _asm_glue->run(_asm_glue_tensors);
+        _asm_glue->run(_asm_glue_tensors->tensors);
     }
     else
     {
@@ -590,7 +603,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
                 ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
             }
 
-            _asm_glue->prepare(_asm_glue_tensors);
+            _asm_glue->prepare(_asm_glue_tensors->tensors);
             if(!original_b_managed_by_weights_manager)
             {
                 _original_b->mark_as_unused();
-- 
cgit v1.2.1