Diffstat (limited to 'arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h')
 arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h | 107 +++++++++-----
 1 file changed, 68 insertions(+), 39 deletions(-)
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index a354a4df7b..ae951669b3 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
@@ -38,8 +39,6 @@
#include "arm_compute/runtime/NEON/functions/NESlice.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
-
namespace arm_compute
{
// Forward declarations
@@ -47,10 +46,10 @@ class ITensor;
/** Basic function to run @ref NELSTMLayerQuantized
*
- * This function calls the following NEON functions/kernels:
+ * This function calls the following functions/kernels:
*
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
+ * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
* -# @ref NETranspose Matrix transpose
* -# @ref NEConcatenateLayer Tensor concatenation
* -# @ref NEActivationLayer Activation functions (tanh and logistic)
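The notable change in the list above is the requantization step: the width-specific NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint function is replaced by the generic NEGEMMLowpOutputStage, which is driven by a GEMMLowpOutputStageInfo descriptor. A minimal sketch of that step follows; the effective scale here is an illustrative assumption, not the value the LSTM derives internally from its input, weight and state quantization infos.

// Hedged sketch: requantizing 32-bit GEMM accumulators to QSYMM16 through the
// generic output stage. 'effective_scale' and the helper function name
// 'requantize_to_qsymm16' are assumptions for illustration only.
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"

using namespace arm_compute;

void requantize_to_qsymm16(ITensor *acc_s32, ITensor *bias_s32, ITensor *dst_qsymm16)
{
    const float effective_scale = 0.003f; // assumed value for illustration

    // Express the float scale as a fixed-point multiplier plus shift.
    int32_t multiplier = 0;
    int32_t shift      = 0;
    quantization::calculate_quantized_multiplier(effective_scale, &multiplier, &shift);

    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.gemmlowp_multiplier = multiplier;
    info.gemmlowp_shift      = shift;
    info.output_data_type    = DataType::QSYMM16;

    NEGEMMLowpOutputStage output_stage;
    output_stage.configure(acc_s32, bias_s32, dst_qsymm16, info);
    output_stage.run();
}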
@@ -77,6 +76,14 @@ public:
~NELSTMLayerQuantized();
/** Initialize function's tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 - src8 |src9 - src12 |src13 |src14 |dst0 |dst1 |
+ * |:-----------|:------------|:-------|:------|:------|:------|
+ * |QASYMM8 |S32 |QSYMM16 |QASYMM8|QSYMM16|QASYMM8|
+ *
* @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8.
* @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
* @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
@@ -96,11 +103,22 @@ public:
* @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
*/
void configure(const ITensor *input,
- const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- ITensor *cell_state_in, const ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out);
+ const ITensor *input_to_input_weights,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_input_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *input_gate_bias,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ ITensor *cell_state_in,
+ const ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out);
/** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayerQuantized
*
@@ -125,11 +143,22 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out);
// Inherited methods overridden:
void run() override;
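The signatures above suggest the usual validate()/configure()/run() flow. The sketch below wires them together end to end; the tensor sizes and quantization parameters are illustrative assumptions (taken from the shape conventions in the parameter docs above), not values mandated by the library.

// A minimal end-to-end sketch, assuming the shapes and data types documented
// for configure(). Quantization infos are illustrative assumptions.
#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    const unsigned int input_size = 2, output_size = 4, batch_size = 1;

    const QuantizationInfo qasymm(1.f / 128.f, 128); // assumed input/weight quantization
    const QuantizationInfo qsymm16(1.f / 2048.f, 0); // assumed cell-state quantization

    auto init = [](Tensor &t, TensorShape shape, DataType dt, QuantizationInfo qi)
    { t.allocator()->init(TensorInfo(shape, 1, dt, qi)); };

    Tensor input, w_ii, w_if, w_ic, w_io, w_ri, w_rf, w_rc, w_ro;
    Tensor b_i, b_f, b_c, b_o, cell_in, out_in, cell_out, out_out;

    init(input, TensorShape(input_size, batch_size), DataType::QASYMM8, qasymm);
    for(Tensor *w : { &w_ii, &w_if, &w_ic, &w_io })
        init(*w, TensorShape(input_size, output_size), DataType::QASYMM8, qasymm);
    for(Tensor *w : { &w_ri, &w_rf, &w_rc, &w_ro })
        init(*w, TensorShape(output_size, output_size), DataType::QASYMM8, qasymm);
    for(Tensor *b : { &b_i, &b_f, &b_c, &b_o })
        init(*b, TensorShape(output_size), DataType::S32, QuantizationInfo());
    init(cell_in,  TensorShape(output_size, batch_size), DataType::QSYMM16, qsymm16);
    init(out_in,   TensorShape(output_size, batch_size), DataType::QASYMM8, qasymm);
    init(cell_out, TensorShape(output_size, batch_size), DataType::QSYMM16, qsymm16);
    init(out_out,  TensorShape(output_size, batch_size), DataType::QASYMM8, qasymm);

    // Check the configuration before committing to it.
    const Status s = NELSTMLayerQuantized::validate(
        input.info(), w_ii.info(), w_if.info(), w_ic.info(), w_io.info(),
        w_ri.info(), w_rf.info(), w_rc.info(), w_ro.info(),
        b_i.info(), b_f.info(), b_c.info(), b_o.info(),
        cell_in.info(), out_in.info(), cell_out.info(), out_out.info());
    if(s.error_code() != ErrorCode::OK)
    {
        return 1;
    }

    NELSTMLayerQuantized lstm;
    lstm.configure(&input, &w_ii, &w_if, &w_ic, &w_io, &w_ri, &w_rf, &w_rc, &w_ro,
                   &b_i, &b_f, &b_c, &b_o, &cell_in, &out_in, &cell_out, &out_out);

    // Allocate backing memory after configure(); real data would be written
    // into the tensors before each call to run().
    for(Tensor *t : { &input, &w_ii, &w_if, &w_ic, &w_io, &w_ri, &w_rf, &w_rc, &w_ro,
                      &b_i, &b_f, &b_c, &b_o, &cell_in, &out_in, &cell_out, &out_out })
    {
        t->allocator()->allocate();
    }

    lstm.run();
    return 0;
}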
@@ -139,30 +168,30 @@ private:
MemoryGroup _memory_group;
// Functions used
- NEGEMMLowpMatrixMultiplyCore _gemmlowp;
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
- NETranspose _transpose_weights;
- NEConcatenateLayer _concat_input_weights;
- NEConcatenateLayer _concat_recurrent_weights;
- NEConcatenateLayer _concat_weights;
- NEConcatenateLayer _concat_inputs;
- NEConcatenateLayer _concat_bias;
- NEActivationLayer _sigmoid_forget_gate;
- NEActivationLayer _sigmoid_input_gate;
- NEActivationLayer _sigmoid_output_gate;
- NEActivationLayer _tanh_modulation_gate;
- NEActivationLayer _tanh_output_state;
- NEArithmeticAddition _add1;
- NEArithmeticAddition _add2;
- NEPixelWiseMultiplication _mul1;
- NEPixelWiseMultiplication _mul2;
- NEPixelWiseMultiplication _mul3;
- NESlice _slice_input_tensor;
- NESlice _slice_forget_tensor;
- NESlice _slice_cell_tensor;
- NESlice _slice_output_tensor;
- NEDequantizationLayer _dequantize;
- NEQuantizationLayer _quantize;
+ NEGEMMLowpMatrixMultiplyCore _gemmlowp;
+ NEGEMMLowpOutputStage _output_stage;
+ NETranspose _transpose_weights;
+ NEConcatenateLayer _concat_input_weights;
+ NEConcatenateLayer _concat_recurrent_weights;
+ NEConcatenateLayer _concat_weights;
+ NEConcatenateLayer _concat_inputs;
+ NEConcatenateLayer _concat_bias;
+ NEActivationLayer _sigmoid_forget_gate;
+ NEActivationLayer _sigmoid_input_gate;
+ NEActivationLayer _sigmoid_output_gate;
+ NEActivationLayer _tanh_modulation_gate;
+ NEActivationLayer _tanh_output_state;
+ NEArithmeticAddition _add1;
+ NEArithmeticAddition _add2;
+ NEPixelWiseMultiplication _mul1;
+ NEPixelWiseMultiplication _mul2;
+ NEPixelWiseMultiplication _mul3;
+ NESlice _slice_input_tensor;
+ NESlice _slice_forget_tensor;
+ NESlice _slice_cell_tensor;
+ NESlice _slice_output_tensor;
+ NEDequantizationLayer _dequantize;
+ NEQuantizationLayer _quantize;
// Tensor pointers
const ITensor *_input_to_input_weights;
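For context on the _memory_group member alongside these function members: composite functions in the library typically register their temporaries with a MemoryGroup at configure() time and acquire the backing memory only for the duration of run(). The sketch below illustrates that pattern in general terms; the class and tensor names are hypothetical, not this file's actual implementation.

// Illustrative sketch of the library's memory-group pattern. 'ExampleFunction'
// and '_gemm_out_tmp' are hypothetical names for illustration.
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

class ExampleFunction
{
public:
    void configure(/* ...inputs and outputs... */)
    {
        _memory_group.manage(&_gemm_out_tmp);  // lifetime of the temporary starts here
        // ...configure the member functions so one writes into _gemm_out_tmp...
        _gemm_out_tmp.allocator()->allocate(); // lifetime ends; memory can be reused
    }
    void run()
    {
        MemoryGroupResourceScope scope_mg(_memory_group); // acquire backing memory
        // ...run the member functions in order...
    }                                                     // released on scope exit
private:
    MemoryGroup _memory_group{};
    Tensor      _gemm_out_tmp{};
};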