From ebcebf1dee7f8314976b1e0cabd62b4cf893d765 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Wed, 21 Oct 2020 00:04:14 +0100
Subject: COMPMID-3638: Move NEON kernels

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Ieed3e4bc8be7fef80c90c5094599b477a56fc473
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4285
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 186 ++++++++++------------
 1 file changed, 87 insertions(+), 99 deletions(-)

(limited to 'arm_compute/runtime/NEON/functions/NEQLSTMLayer.h')
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 7c572de874..17ad5a354b 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -24,24 +24,27 @@
 #ifndef ARM_COMPUTE_NEQLSTMLAYER_H
 #define ARM_COMPUTE_NEQLSTMLAYER_H
 
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
+#include "support/MemorySupport.h"
 
 #include "arm_compute/runtime/common/LSTMParams.h"
+#include <memory>
 
 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
+class ITensorInfo;
+class NEQLSTMLayerNormalizationKernel;
+class NEGEMMLowpMatrixAReductionKernel;
 
 /** Basic function to run @ref NEQLSTMLayer
  *
@@ -70,6 +73,8 @@ public:
     NEQLSTMLayer &operator=(const NEQLSTMLayer &) = delete;
     /** Default move assignment operator */
     NEQLSTMLayer &operator=(NEQLSTMLayer &&) = default;
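+    /** Destructor. Declared here and defined out of line, as the kernels held through std::unique_ptr are only forward-declared in this header. */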
+    ~NEQLSTMLayer();
     /** Initialize function's tensors.
      *
      * @param[in]  input                       Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
@@ -204,7 +209,7 @@ private:
                       Tensor *outstage_res, float gemmlowp_scale,
                       const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
 
-    MemoryGroup _memory_group{};
+    MemoryGroup _memory_group;
 
     /** A small internel kernel do the copy between two tensors */
     class TensorCopyKernel
@@ -217,6 +222,8 @@ private:
         Window   _window{};
 
     public:
+        /** Destructor */
+        ~TensorCopyKernel();
         /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer::TensorCopyKernel
          *
          * @param[in] src Source tensor info.
@@ -236,79 +243,79 @@ private:
     };
 
     // Functions used
-    NETranspose                      _transpose_input_to_forget_weights{};
-    NETranspose                      _transpose_input_to_cell_weights{};
-    NETranspose                      _transpose_input_to_output_weights{};
-    NETranspose                      _transpose_input_to_input_weights{};
-    NETranspose                      _transpose_recurrent_to_forget_weights{};
-    NETranspose                      _transpose_recurrent_to_cell_weights{};
-    NETranspose                      _transpose_recurrent_to_output_weights{};
-    NETranspose                      _transpose_recurrent_to_input_weights{};
-    NETranspose                      _transpose_projection_weights{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _projection_reduction{};
-    NEArithmeticAddition             _projection_bias_add{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
-    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_forget{};
-    NEGEMMLowpOutputStage            _input_to_forget_outstage{};
-    NEGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
-    NEGEMMLowpOutputStage            _cell_to_forget_outstage{};
-    NEArithmeticAddition             _accumulate_input_recurrent_forget{};
-    NEArithmeticAddition             _accumulate_cell_forget{};
-    NEActivationLayer                _forget_gate_sigmoid{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_input_to_cell{};
-    NEGEMMLowpOutputStage            _input_to_cell_outstage{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_cell{};
-    NEGEMMLowpOutputStage            _recurrent_to_cell_outstage{};
-    NEArithmeticAddition             _accumulate_input_recurrent_modulation{};
-    NEActivationLayer                _cell_gate_tanh{};
-    NEArithmeticSubtraction          _input_gate_sub{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_input_to_input{};
-    NEGEMMLowpOutputStage            _input_to_input_outstage{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
-    NEGEMMLowpOutputStage            _recurrent_to_input_outstage{};
-    NEArithmeticAddition             _accumulate_input_recurrent_input{};
-    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_input{};
-    NEGEMMLowpOutputStage            _cell_to_input_outstage{};
-    NEArithmeticAddition             _accumulate_cell_input{};
-    NEActivationLayer                _input_gate_sigmoid{};
-    NEPixelWiseMultiplication        _pixelwise_mul_forget_cell{};
-    NEPixelWiseMultiplication        _pixelwise_mul_input_cell{};
-    NEArithmeticAddition             _add_forget_cell{};
-    NEActivationLayer                _cell_clip{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
-    NEGEMMLowpOutputStage            _input_to_output_outstage{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
-    NEGEMMLowpOutputStage            _recurrent_to_output_outstage{};
-    NEArithmeticAddition             _accumulate_input_recurrent_output{};
-    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_output{};
-    NEGEMMLowpOutputStage            _cell_to_output_outstage{};
-    NEArithmeticAddition             _accumulate_cell_to_output{};
-    NEActivationLayer                _output_gate_sigmoid{};
-    NEActivationLayer                _hidden_tanh{};
-    NEPixelWiseMultiplication        _pixelwise_mul_hidden{};
-    NEGEMMLowpOutputStage            _hidden_outstage{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_projection{};
-    NEGEMMLowpOutputStage            _projection_outstage{};
-    NEArithmeticAddition             _accumulate_projection{};
-    NEActivationLayer                _projection_clip{};
+    NETranspose                                       _transpose_input_to_forget_weights;
+    NETranspose                                       _transpose_input_to_cell_weights;
+    NETranspose                                       _transpose_input_to_output_weights;
+    NETranspose                                       _transpose_input_to_input_weights;
+    NETranspose                                       _transpose_recurrent_to_forget_weights;
+    NETranspose                                       _transpose_recurrent_to_cell_weights;
+    NETranspose                                       _transpose_recurrent_to_output_weights;
+    NETranspose                                       _transpose_recurrent_to_input_weights;
+    NETranspose                                       _transpose_projection_weights;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _projection_reduction;
+    NEArithmeticAddition                              _projection_bias_add;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_input_to_forget;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_forget;
+    NEPixelWiseMultiplication                         _pixelwise_mul_cell_to_forget;
+    NEGEMMLowpOutputStage                             _input_to_forget_outstage;
+    NEGEMMLowpOutputStage                             _recurrent_to_forget_outstage;
+    NEGEMMLowpOutputStage                             _cell_to_forget_outstage;
+    NEArithmeticAddition                              _accumulate_input_recurrent_forget;
+    NEArithmeticAddition                              _accumulate_cell_forget;
+    NEActivationLayer                                 _forget_gate_sigmoid;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_input_to_cell;
+    NEGEMMLowpOutputStage                             _input_to_cell_outstage;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_cell;
+    NEGEMMLowpOutputStage                             _recurrent_to_cell_outstage;
+    NEArithmeticAddition                              _accumulate_input_recurrent_modulation;
+    NEActivationLayer                                 _cell_gate_tanh;
+    NEArithmeticSubtraction                           _input_gate_sub;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_input_to_input;
+    NEGEMMLowpOutputStage                             _input_to_input_outstage;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_input;
+    NEGEMMLowpOutputStage                             _recurrent_to_input_outstage;
+    NEArithmeticAddition                              _accumulate_input_recurrent_input;
+    NEPixelWiseMultiplication                         _pixelwise_mul_cell_to_input;
+    NEGEMMLowpOutputStage                             _cell_to_input_outstage;
+    NEArithmeticAddition                              _accumulate_cell_input;
+    NEActivationLayer                                 _input_gate_sigmoid;
+    NEPixelWiseMultiplication                         _pixelwise_mul_forget_cell;
+    NEPixelWiseMultiplication                         _pixelwise_mul_input_cell;
+    NEArithmeticAddition                              _add_forget_cell;
+    NEActivationLayer                                 _cell_clip;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_input_to_output;
+    NEGEMMLowpOutputStage                             _input_to_output_outstage;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_output;
+    NEGEMMLowpOutputStage                             _recurrent_to_output_outstage;
+    NEArithmeticAddition                              _accumulate_input_recurrent_output;
+    NEPixelWiseMultiplication                         _pixelwise_mul_cell_to_output;
+    NEGEMMLowpOutputStage                             _cell_to_output_outstage;
+    NEArithmeticAddition                              _accumulate_cell_to_output;
+    NEActivationLayer                                 _output_gate_sigmoid;
+    NEActivationLayer                                 _hidden_tanh;
+    NEPixelWiseMultiplication                         _pixelwise_mul_hidden;
+    NEGEMMLowpOutputStage                             _hidden_outstage;
+    NEGEMMLowpMatrixMultiplyCore                      _mm_projection;
+    NEGEMMLowpOutputStage                             _projection_outstage;
+    NEArithmeticAddition                              _accumulate_projection;
+    NEActivationLayer                                 _projection_clip;
 
-    TensorCopyKernel _projection_bias_copy{};
-    TensorCopyKernel _projection_output_to_accumulate_copy{};
-    TensorCopyKernel _projection_accumulate_to_output_copy{};
-    TensorCopyKernel _hidden_to_output_copy{};
+    TensorCopyKernel _projection_bias_copy;
+    TensorCopyKernel _projection_output_to_accumulate_copy;
+    TensorCopyKernel _projection_accumulate_to_output_copy;
+    TensorCopyKernel _hidden_to_output_copy;
 
-    std::array<NEQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
+    std::array<std::unique_ptr<NEQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
 
-    NECopyKernel _copy_output{};
+    NECopy _copy_output;
 
     // Tensor pointers
     const ITensor *_input_to_input_weights
@@ -324,8 +331,8 @@ private:
     const ITensor *_recurrent_to_cell_weights{ nullptr };
     const ITensor *_recurrent_to_output_weights{ nullptr };
     const ITensor *_projection_weights{ nullptr };
-    std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{ {} };
-    std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{ {} };
+    std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{};
+    std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{};
 
     using LayerNormIndexType = typename std::underlying_type<LayerNormGate>::type;
     inline LayerNormIndexType getGateIndex(LayerNormGate g)
@@ -353,32 +360,13 @@ private:
         return _layer_norm_bias[getGateIndex(g)];
     }
 
-    inline NEQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g)
+    inline std::unique_ptr<NEQLSTMLayerNormalizationKernel> &get_layer_norm(LayerNormGate g)
     {
         return _layer_norms[getGateIndex(g)];
     }
 
-    inline void configure_layer_norm(LayerNormGate g, const ITensor *in)
-    {
-        ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
-
-        Tensor &out = get_layer_norm_output(g);
-        _memory_group.manage(&out);
-        out.allocator()->init(*(in->info()));
-
-        get_layer_norm(g).configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g));
-    }
-
-    inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
-    {
-        // Output quantization scale will be different, but ignored here
-        // since it will be configured at configure() stage.
-        const TensorInfo out
-        {
-            in
-        };
-        return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
-    }
+    void configure_layer_norm(LayerNormGate g, const ITensor *in);
+    static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
 
     // Temporary tensors
     Tensor _input_to_forget_weights_transposed{ nullptr };
@@ -434,7 +422,7 @@ private:
     Tensor _projection_out_res{ nullptr };
     Tensor _projection_accumulate_res{ nullptr };
     Tensor _ones{ nullptr };
-    std::array<Tensor, _layer_norm_count> _layer_norm_output{ {} };
+    std::array<Tensor, _layer_norm_count> _layer_norm_output{};
 
     inline Tensor &get_layer_norm_output(LayerNormGate g)
     {
-- 
cgit v1.2.1