1 files changed, 239 insertions, 202 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index d1cc962940..009a4e0911 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,37 +24,46 @@
 #ifndef ARM_COMPUTE_NEQLSTMLAYER_H
 #define ARM_COMPUTE_NEQLSTMLAYER_H
 
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 
-#include "arm_compute/runtime/common/LSTMParams.h"
+#include <memory>
 
 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
-
+class ITensorInfo;
+class NEQLSTMLayerNormalizationKernel;
+namespace cpu
+{
+namespace kernels
+{
+class CpuGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace cpu
 /** Basic function to run @ref NEQLSTMLayer
  *
- * This function calls the following NEON functions/kernels:
+ * This function calls the following functions/kernels:
  *
  * -# @ref NEActivationLayer                                     Activation functions (tanh and logistic)
- * -# @ref NEArithmeticAdditionKernel                            Elementwise addition
- * -# @ref NEArithmeticSubtractionKernel                         Elementwise subtraction
- * -# @ref NECopyKernel                                          Copy kernel for copying output_state_out to output
+ * -# @ref NEArithmeticAddition                                  Elementwise addition
+ * -# @ref NEArithmeticSubtraction                               Elementwise subtraction
+ * -# @ref NECopy                                                Copy kernel for copying output_state_out to output
  * -# @ref NEGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
- * -# @ref NEGEMMLowpMatrixAReductionKernel                      For precomputing effective biases to use
- * -# @ref NEPixelWiseMultiplicationKernel                       Elementwise multiplication
+ * -# @ref NEGEMMLowpOutputStage                                 Convert 32-bit integers into QSYMM16
+ * -# @ref cpu::kernels::CpuGemmLowpMatrixAReductionKernel            For precomputing effective biases to use
+ * -# @ref NEPixelWiseMultiplication                             Elementwise multiplication
  * -# @ref NETranspose                                           Transpose function for reshaping the weights
  * */
 class NEQLSTMLayer : public IFunction
@@ -64,14 +73,24 @@ public:
     NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEQLSTMLayer(const NEQLSTMLayer &) = delete;
-    /** Default move constructor */
-    NEQLSTMLayer(NEQLSTMLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains pointers) */
+    NEQLSTMLayer(NEQLSTMLayer &&) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEQLSTMLayer &operator=(const NEQLSTMLayer &) = delete;
-    /** Default move assignment operator */
-    NEQLSTMLayer &operator=(NEQLSTMLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains pointers) */
+    NEQLSTMLayer &operator=(NEQLSTMLayer &&) = delete;
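+    /** Default destructor (defined out of line, where the forward-declared kernel types owned through std::unique_ptr members are complete) */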
+    ~NEQLSTMLayer();
     /** Initialize function's tensors.
      *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src0          |src1 - src6  |src7 - src9  |src10  |src11         |dst0   |dst1 - dst2       |
+     * |:-------------|:------------|:------------|:------|:-------------|:------|:-----------------|
+     * |QASYMM8_SIGNED|QASYMM8      |S32          |QSYMM16|QASYMM8_SIGNED|QSYMM16|QASYMM8_SIGNED    |
+     *
      * @param[in]  input                       Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
      * @param[in]  input_to_forget_weights     2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
      * @param[in]  input_to_cell_weights       2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
@@ -111,12 +130,21 @@ public:
      *                                         projection_threshold       (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
      *                                                                               [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
      */
-    void configure(const ITensor *input,
-                   const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
-                   const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
-                   const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                   const ITensor *cell_state_in, const ITensor *output_state_in,
-                   ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+    void configure(const ITensor             *input,
+                   const ITensor             *input_to_forget_weights,
+                   const ITensor             *input_to_cell_weights,
+                   const ITensor             *input_to_output_weights,
+                   const ITensor             *recurrent_to_forget_weights,
+                   const ITensor             *recurrent_to_cell_weights,
+                   const ITensor             *recurrent_to_output_weights,
+                   const ITensor             *forget_gate_bias,
+                   const ITensor             *cell_bias,
+                   const ITensor             *output_gate_bias,
+                   const ITensor             *cell_state_in,
+                   ITensor                   *output_state_in,
+                   ITensor                   *cell_state_out,
+                   ITensor                   *output_state_out,
+                   ITensor                   *output,
                    const LSTMParams<ITensor> &lstm_params);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer
@@ -161,12 +189,21 @@ public:
      *                                                                              [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
      * @return a status
      */
-    static Status validate(const ITensorInfo *input,
-                           const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                           const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                           const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                           const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                           const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+    static Status validate(const ITensorInfo             *input,
+                           const ITensorInfo             *input_to_forget_weights,
+                           const ITensorInfo             *input_to_cell_weights,
+                           const ITensorInfo             *input_to_output_weights,
+                           const ITensorInfo             *recurrent_to_forget_weights,
+                           const ITensorInfo             *recurrent_to_cell_weights,
+                           const ITensorInfo             *recurrent_to_output_weights,
+                           const ITensorInfo             *forget_gate_bias,
+                           const ITensorInfo             *cell_bias,
+                           const ITensorInfo             *output_gate_bias,
+                           const ITensorInfo             *cell_state_in,
+                           const ITensorInfo             *output_state_in,
+                           const ITensorInfo             *cell_state_out,
+                           const ITensorInfo             *output_state_out,
+                           const ITensorInfo             *output,
                            const LSTMParams<ITensorInfo> &lstm_params);
 
     // Inherited methods overridden:
@@ -199,24 +236,33 @@ private:
      * @param[in] mm_res_info    Tensor info to be used to initialize output stage result tensor.
      *
      */
-    void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
-                      const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, Tensor *mm_res,
-                      Tensor *outstage_res, float gemmlowp_scale,
-                      const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
+    void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+                      NEGEMMLowpOutputStage        &outstage,
+                      GEMMLowpOutputStageInfo      &gemmlowp_info,
+                      const ITensor                *mm_input,
+                      const ITensor                *mm_weights,
+                      const ITensor                *bias,
+                      Tensor                       *mm_res,
+                      Tensor                       *outstage_res,
+                      float                         gemmlowp_scale,
+                      const TensorInfo             &mm_res_info,
+                      const TensorInfo             &outstage_tensor_info);
 
-    MemoryGroup _memory_group{};
+    MemoryGroup _memory_group;
 
     /** A small internel kernel do the copy between two tensors */
     class TensorCopyKernel
     {
         static constexpr uint32_t max_dimension_supported = 2;
 
-        ITensor *_src{ nullptr };
-        ITensor *_dst{ nullptr };
+        ITensor *_src{nullptr};
+        ITensor *_dst{nullptr};
         size_t   _row_size{};
         Window   _window{};
 
     public:
+        /** Destructor */
+        ~TensorCopyKernel();
         /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer::TensorCopyKernel
          *
          * @param[in] src Source tensor info.
@@ -236,93 +282,96 @@ private:
     };
 
     // Functions used
-    NETranspose                      _transpose_input_to_forget_weights{};
-    NETranspose                      _transpose_input_to_cell_weights{};
-    NETranspose                      _transpose_input_to_output_weights{};
-    NETranspose                      _transpose_input_to_input_weights{};
-    NETranspose                      _transpose_recurrent_to_forget_weights{};
-    NETranspose                      _transpose_recurrent_to_cell_weights{};
-    NETranspose                      _transpose_recurrent_to_output_weights{};
-    NETranspose                      _transpose_recurrent_to_input_weights{};
-    NETranspose                      _transpose_projection_weights{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _projection_reduction{};
-    NEArithmeticAdditionKernel       _projection_bias_add{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_forget{};
-    NEGEMMLowpOutputStage            _input_to_forget_outstage{};
-    NEGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
-    NEGEMMLowpOutputStage            _cell_to_forget_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_forget{};
-    NEArithmeticAdditionKernel       _accumulate_cell_forget{};
-    NEActivationLayer                _forget_gate_sigmoid{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_input_to_cell{};
-    NEGEMMLowpOutputStage            _input_to_cell_outstage{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_cell{};
-    NEGEMMLowpOutputStage            _recurrent_to_cell_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_modulation{};
-    NEActivationLayer                _cell_gate_tanh{};
-    NEArithmeticSubtractionKernel    _input_gate_sub{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_input_to_input{};
-    NEGEMMLowpOutputStage            _input_to_input_outstage{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
-    NEGEMMLowpOutputStage            _recurrent_to_input_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_input{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_input{};
-    NEGEMMLowpOutputStage            _cell_to_input_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_cell_input{};
-    NEActivationLayer                _input_gate_sigmoid{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_forget_cell{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_input_cell{};
-    NEArithmeticAdditionKernel       _add_forget_cell{};
-    NEActivationLayer                _cell_clip{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
-    NEGEMMLowpOutputStage            _input_to_output_outstage{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
-    NEGEMMLowpOutputStage            _recurrent_to_output_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_output{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_output{};
-    NEGEMMLowpOutputStage            _cell_to_output_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_cell_to_output{};
-    NEActivationLayer                _output_gate_sigmoid{};
-    NEActivationLayer                _hidden_tanh{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_hidden{};
-    NEGEMMLowpOutputStage            _hidden_outstage{};
-    NEGEMMLowpMatrixMultiplyCore     _mm_projection{};
-    NEGEMMLowpOutputStage            _projection_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_projection{};
-    NEActivationLayer                _projection_clip{};
 
-    TensorCopyKernel _projection_bias_copy{};
-    TensorCopyKernel _projection_output_to_accumulate_copy{};
-    TensorCopyKernel _projection_accumulate_to_output_copy{};
-    TensorCopyKernel _hidden_to_output_copy{};
+    NEDequantizationLayer                                            _dequantize_input_to_forget_weights;
+    NEQuantizationLayer                                              _quantize_input_to_forget_weights;
+    NETranspose                                                      _transpose_input_to_forget_weights;
+    NETranspose                                                      _transpose_input_to_cell_weights;
+    NETranspose                                                      _transpose_input_to_output_weights;
+    NETranspose                                                      _transpose_input_to_input_weights;
+    NETranspose                                                      _transpose_recurrent_to_forget_weights;
+    NETranspose                                                      _transpose_recurrent_to_cell_weights;
+    NETranspose                                                      _transpose_recurrent_to_output_weights;
+    NETranspose                                                      _transpose_recurrent_to_input_weights;
+    NETranspose                                                      _transpose_projection_weights;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+    std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _projection_reduction;
+    NEArithmeticAddition                                             _projection_bias_add;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_input_to_forget;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_recurrent_to_forget;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_cell_to_forget;
+    NEGEMMLowpOutputStage                                            _input_to_forget_outstage;
+    NEGEMMLowpOutputStage                                            _recurrent_to_forget_outstage;
+    NEGEMMLowpOutputStage                                            _cell_to_forget_outstage;
+    NEArithmeticAddition                                             _accumulate_input_recurrent_forget;
+    NEArithmeticAddition                                             _accumulate_cell_forget;
+    NEActivationLayer                                                _forget_gate_sigmoid;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_input_to_cell;
+    NEGEMMLowpOutputStage                                            _input_to_cell_outstage;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_recurrent_to_cell;
+    NEGEMMLowpOutputStage                                            _recurrent_to_cell_outstage;
+    NEArithmeticAddition                                             _accumulate_input_recurrent_modulation;
+    NEActivationLayer                                                _cell_gate_tanh;
+    NEArithmeticSubtraction                                          _input_gate_sub;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_input_to_input;
+    NEGEMMLowpOutputStage                                            _input_to_input_outstage;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_recurrent_to_input;
+    NEGEMMLowpOutputStage                                            _recurrent_to_input_outstage;
+    NEArithmeticAddition                                             _accumulate_input_recurrent_input;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_cell_to_input;
+    NEGEMMLowpOutputStage                                            _cell_to_input_outstage;
+    NEArithmeticAddition                                             _accumulate_cell_input;
+    NEActivationLayer                                                _input_gate_sigmoid;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_forget_cell;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_input_cell;
+    NEArithmeticAddition                                             _add_forget_cell;
+    NEActivationLayer                                                _cell_clip;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_input_to_output;
+    NEGEMMLowpOutputStage                                            _input_to_output_outstage;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_recurrent_to_output;
+    NEGEMMLowpOutputStage                                            _recurrent_to_output_outstage;
+    NEArithmeticAddition                                             _accumulate_input_recurrent_output;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_cell_to_output;
+    NEGEMMLowpOutputStage                                            _cell_to_output_outstage;
+    NEArithmeticAddition                                             _accumulate_cell_to_output;
+    NEActivationLayer                                                _output_gate_sigmoid;
+    NEActivationLayer                                                _hidden_tanh;
+    NEPixelWiseMultiplication                                        _pixelwise_mul_hidden;
+    NEGEMMLowpOutputStage                                            _hidden_outstage;
+    NEGEMMLowpMatrixMultiplyCore                                     _mm_projection;
+    NEGEMMLowpOutputStage                                            _projection_outstage;
+    NEArithmeticAddition                                             _accumulate_projection;
+    NEActivationLayer                                                _projection_clip;
+
+    TensorCopyKernel _projection_bias_copy;
+    TensorCopyKernel _projection_output_to_accumulate_copy;
+    TensorCopyKernel _projection_accumulate_to_output_copy;
+    TensorCopyKernel _hidden_to_output_copy;
 
-    std::array<NEQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
+    std::array<std::unique_ptr<NEQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
 
-    NECopyKernel _copy_output{};
+    NECopy _copy_output;
 
     // Tensor pointers
-    const ITensor *_input_to_input_weights{ nullptr };
-    const ITensor *_recurrent_to_input_weights{ nullptr };
-    const ITensor *_projection_bias{ nullptr };
-    const ITensor *_input_to_forget_weights{ nullptr };
-    const ITensor *_input_to_cell_weights{ nullptr };
-    const ITensor *_input_to_output_weights{ nullptr };
-    const ITensor *_recurrent_to_forget_weights{ nullptr };
-    const ITensor *_recurrent_to_cell_weights{ nullptr };
-    const ITensor *_recurrent_to_output_weights{ nullptr };
-    const ITensor *_projection_weights{ nullptr };
-    std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{ {} };
-    std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{ {} };
+    const ITensor                                 *_input_to_input_weights{nullptr};
+    const ITensor                                 *_recurrent_to_input_weights{nullptr};
+    const ITensor                                 *_projection_bias{nullptr};
+    const ITensor                                 *_input_to_forget_weights{nullptr};
+    const ITensor                                 *_input_to_cell_weights{nullptr};
+    const ITensor                                 *_input_to_output_weights{nullptr};
+    const ITensor                                 *_recurrent_to_forget_weights{nullptr};
+    const ITensor                                 *_recurrent_to_cell_weights{nullptr};
+    const ITensor                                 *_recurrent_to_output_weights{nullptr};
+    const ITensor                                 *_projection_weights{nullptr};
+    std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{};
+    std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{};
 
     using LayerNormIndexType = typename std::underlying_type<LayerNormGate>::type;
     inline LayerNormIndexType getGateIndex(LayerNormGate g)
@@ -350,99 +399,87 @@ private:
         return _layer_norm_bias[getGateIndex(g)];
     }
 
-    inline NEQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g)
+    inline std::unique_ptr<NEQLSTMLayerNormalizationKernel> &get_layer_norm(LayerNormGate g)
     {
         return _layer_norms[getGateIndex(g)];
     }
 
-    inline void configure_layer_norm(LayerNormGate g, const ITensor *in)
-    {
-        ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
-
-        Tensor &out = get_layer_norm_output(g);
-        _memory_group.manage(&out);
-        out.allocator()->init(*(in->info()));
-
-        get_layer_norm(g).configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g));
-    }
-
-    inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
-    {
-        // Output quantization scale will be different, but ignored here
-        // since it will be configured at configure() stage.
-        const TensorInfo out{ in };
-        return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
-    }
+    void          configure_layer_norm(LayerNormGate g, const ITensor *in);
+    static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
 
     // Temporary tensors
-    Tensor _input_to_forget_weights_transposed{ nullptr };
-    Tensor _input_to_cell_weights_transposed{ nullptr };
-    Tensor _input_to_output_weights_transposed{ nullptr };
-    Tensor _input_to_input_weights_transposed{ nullptr };
-    Tensor _recurrent_to_forget_weights_transposed{ nullptr };
-    Tensor _recurrent_to_cell_weights_transposed{ nullptr };
-    Tensor _recurrent_to_output_weights_transposed{ nullptr };
-    Tensor _recurrent_to_input_weights_transposed{ nullptr };
-    Tensor _projection_weights_transposed{ nullptr };
-    Tensor _input_to_input_eff_bias{ nullptr };
-    Tensor _recurrent_to_input_eff_bias{ nullptr };
-    Tensor _input_to_forget_eff_bias{ nullptr };
-    Tensor _recurrent_to_forget_eff_bias{ nullptr };
-    Tensor _input_to_cell_eff_bias{ nullptr };
-    Tensor _recurrent_to_cell_eff_bias{ nullptr };
-    Tensor _input_to_output_eff_bias{ nullptr };
-    Tensor _recurrent_to_output_eff_bias{ nullptr };
-    Tensor _projection_reduction_res{ nullptr };
-    Tensor _projection_eff_bias{ nullptr };
-    Tensor _mm_input_to_forget_res{ nullptr };
-    Tensor _mm_recurrent_to_forget_res{ nullptr };
-    Tensor _mul_cell_to_forget_res{ nullptr };
-    Tensor _input_to_forget_outstage_res{ nullptr };
-    Tensor _cell_to_forget_outstage_res{ nullptr };
-    Tensor _recurrent_to_forget_outstage_res{ nullptr };
-    Tensor _forget_gate{ nullptr };
-    Tensor _mm_input_to_cell_res{ nullptr };
-    Tensor _input_to_cell_outstage_res{ nullptr };
-    Tensor _mm_recurrent_to_cell_res{ nullptr };
-    Tensor _recurrent_to_cell_outstage_res{ nullptr };
-    Tensor _cell_gate{ nullptr };
-    Tensor _mul_input_cell_res{ nullptr };
-    Tensor _mm_input_to_input_res{ nullptr };
-    Tensor _input_to_input_outstage_res{ nullptr };
-    Tensor _mm_recurrent_to_input_res{ nullptr };
-    Tensor _mul_cell_to_input_res{ nullptr };
-    Tensor _cell_to_input_outstage_res{ nullptr };
-    Tensor _recurrent_to_input_outstage_res{ nullptr };
-    Tensor _input_gate{ nullptr };
-    Tensor _mm_input_to_output_res{ nullptr };
-    Tensor _input_to_output_outstage_res{ nullptr };
-    Tensor _mm_recurrent_to_output_res{ nullptr };
-    Tensor _mul_cell_to_output_res{ nullptr };
-    Tensor _cell_to_output_outstage_res{ nullptr };
-    Tensor _recurrent_to_output_outstage_res{ nullptr };
-    Tensor _output_gate{ nullptr };
-    Tensor _hidden_mul_res{ nullptr };
-    Tensor _hidden_gate{ nullptr };
-    Tensor _mm_projection_res{ nullptr };
-    Tensor _projection_outstage_res{ nullptr };
-    Tensor _projection_out_res{ nullptr };
-    Tensor _projection_accumulate_res{ nullptr };
-    Tensor _ones{ nullptr };
-    std::array<Tensor, _layer_norm_count> _layer_norm_output{ {} };
+    Tensor _input_to_forget_weights_f32{nullptr};
+    Tensor _input_to_forget_weights_symm8{nullptr};
+
+    Tensor                                _input_to_forget_weights_transposed{nullptr};
+    Tensor                                _input_to_cell_weights_transposed{nullptr};
+    Tensor                                _input_to_output_weights_transposed{nullptr};
+    Tensor                                _input_to_input_weights_transposed{nullptr};
+    Tensor                                _recurrent_to_forget_weights_transposed{nullptr};
+    Tensor                                _recurrent_to_cell_weights_transposed{nullptr};
+    Tensor                                _recurrent_to_output_weights_transposed{nullptr};
+    Tensor                                _recurrent_to_input_weights_transposed{nullptr};
+    Tensor                                _projection_weights_transposed{nullptr};
+    Tensor                                _input_to_input_eff_bias{nullptr};
+    Tensor                                _recurrent_to_input_eff_bias{nullptr};
+    Tensor                                _input_to_forget_eff_bias{nullptr};
+    Tensor                                _recurrent_to_forget_eff_bias{nullptr};
+    Tensor                                _input_to_cell_eff_bias{nullptr};
+    Tensor                                _recurrent_to_cell_eff_bias{nullptr};
+    Tensor                                _input_to_output_eff_bias{nullptr};
+    Tensor                                _recurrent_to_output_eff_bias{nullptr};
+    Tensor                                _projection_reduction_res{nullptr};
+    Tensor                                _projection_eff_bias{nullptr};
+    Tensor                                _mm_input_to_forget_res{nullptr};
+    Tensor                                _mm_recurrent_to_forget_res{nullptr};
+    Tensor                                _mul_cell_to_forget_res{nullptr};
+    Tensor                                _input_to_forget_outstage_res{nullptr};
+    Tensor                                _cell_to_forget_outstage_res{nullptr};
+    Tensor                                _recurrent_to_forget_outstage_res{nullptr};
+    Tensor                                _forget_gate{nullptr};
+    Tensor                                _mm_input_to_cell_res{nullptr};
+    Tensor                                _input_to_cell_outstage_res{nullptr};
+    Tensor                                _mm_recurrent_to_cell_res{nullptr};
+    Tensor                                _recurrent_to_cell_outstage_res{nullptr};
+    Tensor                                _cell_gate{nullptr};
+    Tensor                                _mul_input_cell_res{nullptr};
+    Tensor                                _mm_input_to_input_res{nullptr};
+    Tensor                                _input_to_input_outstage_res{nullptr};
+    Tensor                                _mm_recurrent_to_input_res{nullptr};
+    Tensor                                _mul_cell_to_input_res{nullptr};
+    Tensor                                _cell_to_input_outstage_res{nullptr};
+    Tensor                                _recurrent_to_input_outstage_res{nullptr};
+    Tensor                                _input_gate{nullptr};
+    Tensor                                _mm_input_to_output_res{nullptr};
+    Tensor                                _input_to_output_outstage_res{nullptr};
+    Tensor                                _mm_recurrent_to_output_res{nullptr};
+    Tensor                                _mul_cell_to_output_res{nullptr};
+    Tensor                                _cell_to_output_outstage_res{nullptr};
+    Tensor                                _recurrent_to_output_outstage_res{nullptr};
+    Tensor                                _output_gate{nullptr};
+    Tensor                                _hidden_mul_res{nullptr};
+    Tensor                                _hidden_gate{nullptr};
+    Tensor                                _mm_projection_res{nullptr};
+    Tensor                                _projection_outstage_res{nullptr};
+    Tensor                                _projection_out_res{nullptr};
+    Tensor                                _projection_accumulate_res{nullptr};
+    Tensor                                _ones{nullptr};
+    std::array<Tensor, _layer_norm_count> _layer_norm_output{};
 
     inline Tensor &get_layer_norm_output(LayerNormGate g)
     {
         return _layer_norm_output[getGateIndex(g)];
     }
 
-    bool _is_prepared{ false };
-    bool _has_cifg{ false };
-    bool _has_cell_clipping{ false };
-    bool _has_projection{ false };
-    bool _has_projection_clipping{ false };
-    bool _has_peephole{ false };
-    bool _has_layer_norm{ false };
-    bool _projection_tensor_copy_required{ false };
+    bool _is_prepared{false};
+    bool _has_cifg{false};
+    bool _has_cell_clipping{false};
+    bool _has_projection{false};
+    bool _has_projection_clipping{false};
+    bool _has_peephole{false};
+    bool _has_layer_norm{false};
+    bool _projection_tensor_copy_required{false};
+    bool _convert_input_to_forget_weights_to_qsymm8{false};
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEQLSTMLAYER_H */