From d7316eb877cc4ff8573219374335e917b19a0203 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Wed, 16 Jun 2021 11:14:41 +0100
Subject: Port NEGEMMConv2d to memory injecting interface

Resolves: COMPMID-4506, COMPMID-4570

Change-Id: I6d37a06da141f1fcfcaa8525322a319cb0234791
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5824
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 arm_compute/runtime/NEON/functions/NEGEMM.h        | 15 +++---
 arm_compute/runtime/NEON/functions/NEGEMMConv2d.h  |  5 +-
 .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h  | 63 ++--------------------
 .../NEON/functions/NEWinogradConvolutionLayer.h    | 15 +----
 4 files changed, 12 insertions(+), 86 deletions(-)

(limited to 'arm_compute/runtime/NEON')
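For readers unfamiliar with the interface named in the subject line: "memory injecting" refers to Compute Library's move toward stateless operators. A function no longer owns its scratch tensors; the underlying operator is configured once from tensor metadata, publishes how much auxiliary memory it needs, and then receives every tensor (inputs, outputs, and workspace) at run time through a tensor pack keyed by slot ids. The sketch below illustrates that contract in self-contained C++; all names in it (TensorPack, MemoryRequirement, ToyGemmOperator) are hypothetical stand-ins for illustration, not Compute Library API.

// Minimal sketch of the memory-injecting operator contract.
// Hypothetical names throughout; this is not the Compute Library API.
#include <cstddef>
#include <map>
#include <vector>

enum Slot { SRC_0, SRC_1, DST, WORKSPACE_0 }; // pack slot ids

// Maps a slot id to a caller-owned buffer; the operator owns nothing.
struct TensorPack
{
    std::map<int, void *> tensors;
    void *get(int slot) const { return tensors.at(slot); }
};

// One auxiliary buffer the operator asks its caller to provide.
struct MemoryRequirement
{
    int         slot; // where the caller must inject it in the pack
    std::size_t size; // bytes, known once configure() has run
};

class ToyGemmOperator
{
public:
    // Configure from shapes/metadata only and record the scratch needed.
    void configure(std::size_t m, std::size_t n, std::size_t k)
    {
        m_ = m; n_ = n; k_ = k;
        aux_ = { { WORKSPACE_0, m * k * sizeof(float) } }; // e.g. a packed copy of A
    }
    const std::vector<MemoryRequirement> &workspace() const { return aux_; }

    // Every tensor, including scratch, arrives through the injected pack.
    void run(const TensorPack &pack)
    {
        const float *a = static_cast<const float *>(pack.get(SRC_0));
        const float *b = static_cast<const float *>(pack.get(SRC_1));
        float       *d = static_cast<float *>(pack.get(DST));
        float *scratch = static_cast<float *>(pack.get(WORKSPACE_0));
        (void)scratch; // a real kernel would pack A into the workspace here
        for (std::size_t i = 0; i < m_; ++i)
            for (std::size_t j = 0; j < n_; ++j)
            {
                float acc = 0.f;
                for (std::size_t p = 0; p < k_; ++p)
                    acc += a[i * k_ + p] * b[p * n_ + j];
                d[i * n_ + j] = acc;
            }
    }

private:
    std::size_t m_{}, n_{}, k_{};
    std::vector<MemoryRequirement> aux_{};
};

The members added to NEGEMM in the first hunk below are the caller's half of this contract: _asm_glue_run_pack and _asm_glue_prep_pack carry the tensors injected into run() and prepare() respectively, _asm_glue_workspace owns the buffers allocated to satisfy the requirements recorded in _aux_mem_req, and the newly included MemoryHelpers.h provides the WorkspaceData type.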
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 6fa30bd545..6c5be0eb5e 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -32,6 +32,7 @@
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/Tensor.h"
+#include "src/core/helpers/MemoryHelpers.h"
 
 #include <memory>
 
@@ -105,14 +106,7 @@ public:
     void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM.
      *
-     * @param[in]  a         First input tensor info (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32
-     * @param[in]  b         Second input tensor info (Matrix B). Data type supported: same as @p a.
-     * @param[in]  c         Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
-     * @param[out] output    Output tensor info. Data type supported: same as @p a
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of matrix C
-     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should happen only for the first run
+     * Similar to @ref NEGEMM::configure()
      *
      * @return a status
      */
@@ -146,7 +140,10 @@ private:
     bool _reshape_b_only_on_first_run;
     bool _is_prepared;
 
-    ITensorPack _asm_glue_tensors{};
+    ITensorPack                      _asm_glue_run_pack;
+    ITensorPack                      _asm_glue_prep_pack;
+    WorkspaceData<Tensor>            _asm_glue_workspace;
+    experimental::MemoryRequirements _aux_mem_req;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEGEMM_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
index f39ce4dfa3..53ceb6d978 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -29,15 +29,12 @@
 #include "arm_compute/runtime/IMemoryManager.h"
 
 #include <memory>
+
 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
 class ITensorInfo;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
 
 /** Basic function to compute the convolution layer. This function calls the following kernels/functions:
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index dc9783f9eb..ff888760e1 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -24,32 +24,15 @@
 #ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
 #define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
 
-#include "NEActivationLayer.h"
-#include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
 
 namespace arm_compute
 {
 class ITensor;
-class NEConvertQuantizedSignednessKernel;
-class NEGEMMInterleave4x4Kernel;
-class NEGEMMLowpMatrixMultiplyKernel;
-class NEGEMMLowpOffsetContributionKernel;
-class NEGEMMLowpOffsetContributionOutputStageKernel;
-class NEGEMMLowpMatrixAReductionKernel;
-class NEGEMMLowpMatrixBReductionKernel;
-class NEGEMMTranspose1xWKernel;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
-
 /** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
  *
  * -# @ref NEGEMMInterleave4x4Kernel
@@ -119,14 +102,7 @@ public:
     void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
      *
-     * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
-     *
-     * @param[in]  a         First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  b         Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[in]  c         Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
-     * @param[in]  output    Output tensor info. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
-     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should be executed only for the first run
+     * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure()
      *
      * @return a status
      */
@@ -137,41 +113,8 @@ public:
     void prepare() override;
 
 private:
-    MemoryGroup      _memory_group;
-    IWeightsManager *_weights_manager;
-    std::unique_ptr<cpu::CpuGemmAssemblyDispatch>                  _asm_glue;
-    std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel>                _mm_kernel;
-    std::unique_ptr<NEGEMMInterleave4x4Kernel>                     _mtx_a_reshape_kernel;
-    std::unique_ptr<NEGEMMTranspose1xWKernel>                      _mtx_b_reshape_kernel;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
-    std::unique_ptr<NEGEMMLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
-    std::unique_ptr<NEGEMMLowpOffsetContributionKernel>            _offset_contribution_kernel;
-    std::unique_ptr<NEGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
-    NEActivationLayer                                              _activation_func;
-    std::unique_ptr<NEConvertQuantizedSignednessKernel>            _convert_to_signed_asymm;
-    std::unique_ptr<NEConvertQuantizedSignednessKernel>            _convert_from_signed_asymm;
-
-    Tensor         _vector_sum_col;
-    Tensor         _vector_sum_row;
-    Tensor         _tmp_a;
-    Tensor         _tmp_b;
-    Tensor         _mm_result_s32;
-    Tensor         _signed_a;
-    Tensor         _signed_output;
-    const ITensor *_original_b;
-    int32_t        _a_offset;
-    int32_t        _b_offset;
-
-    bool _run_vector_matrix_multiplication;
-    bool _assembly_path;
-    bool _fused_assembly_path;
-    bool _reshape_b_only_on_first_run;
-    bool _is_prepared;
-    bool _fuse_output_stage;
-    bool _run_activation;
-    bool _flip_signedness;
-
-    ITensorPack _asm_glue_tensors{};
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */
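The NEGEMMLowpMatrixMultiplyCore hunks above also collapse roughly thirty private members into the pimpl idiom: the header now forward-declares struct Impl and keeps a single std::unique_ptr<Impl>, so kernel and tensor types no longer leak out of the public include and internal refactors stop forcing rebuilds of user code. One C++ subtlety this pattern depends on: a std::unique_ptr to an incomplete type is a valid member, but the enclosing class's constructor and destructor must then be defined out of line, in the translation unit where Impl is complete. A minimal sketch with hypothetical names (Foo is not a Compute Library class):

// Minimal sketch of the pimpl idiom; Foo is a hypothetical class.
// --- Foo.h: no internal types appear in the public interface ---
#include <memory>

class Foo
{
public:
    Foo();
    ~Foo(); // declared here, defined in Foo.cpp where Impl is complete
    void run();

private:
    struct Impl;                 // forward declaration only
    std::unique_ptr<Impl> _impl;
};

// --- Foo.cpp: the real members live here and can change freely ---
struct Foo::Impl
{
    int state{ 0 }; // stand-in for the kernels, tensors and flags above
};

Foo::Foo() : _impl(std::make_unique<Impl>()) {}
Foo::~Foo() = default; // Impl is a complete type here, so this compiles
void Foo::run() { ++_impl->state; }

This is why classes written in this style declare their destructor in the header and define it, often as = default, in the source file next to Impl.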
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index f9ebf608cb..b02c4ed5b7 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -96,20 +96,9 @@ public:
     void run() override;
     void prepare() override;
 
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
+    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradConvolutionLayer
      *
-     * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                             while every optional dimension from 4 and above represent a batch of inputs.
-     *                             Data types supported: F16/F32.
-     * @param[in] weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
-     *                             Currently only 3x3 and 5x5 kernels are supported.
-     * @param[in] biases           Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
-     * @param[in] output           Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                             Data types supported: Same as @p input.
-     * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported.
-     * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
-     *                             available which may introduce a drop of accuracy as well. Default is false
+     * Similar to @ref NEWinogradConvolutionLayer::configure()
      *
      * @return a status
      */
--
cgit v1.2.1