From ba27e4467dfc04e23ce9483330be062e9aaebdc5 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Tue, 28 May 2019 10:04:57 +0100
Subject: COMPMID-2236: QUANTIZED_16BIT_LSTM operator for NEON

Change-Id: I554023508e09b790ecc1bbdada529697d6c7b616
Signed-off-by: giuros01 <giuseppe.rossini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1551
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
---
 .../NEON/kernels/NEDequantizationLayerKernel.h     |   4 +-
 arm_compute/core/Validate.h                        |   4 +-
 arm_compute/runtime/NEON/NEFunctions.h             |   1 +
 .../runtime/NEON/functions/NEDequantizationLayer.h |   4 +-
 .../runtime/NEON/functions/NELSTMLayerQuantized.h  | 205 +++++++++
 .../runtime/NEON/functions/NEQuantizationLayer.h   |   4 +-
 .../NEON/kernels/NEDequantizationLayerKernel.cpp   |  71 +++-
 src/core/NEON/kernels/NEStridedSliceKernel.cpp     |   2 +-
 .../NEON/kernels/NEWidthConcatenateLayerKernel.cpp |   2 +-
 .../NEON/functions/NELSTMLayerQuantized.cpp        | 376 +++++++++++++++++
 tests/validation/NEON/LSTMLayerQuantized.cpp       | 458 +++++++++++++++++++++
 11 files changed, 1119 insertions(+), 12 deletions(-)
 create mode 100644 arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
 create mode 100644 src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
 create mode 100644 tests/validation/NEON/LSTMLayerQuantized.cpp

diff --git a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
index 3320ba6889..f0a2a57d1a 100644
--- a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
@@ -52,13 +52,13 @@ public:
     ~NEDequantizationLayerKernel() = default;
     /** Set input, output tensors.
      *
-     * @param[in]  input  Source tensor. Data type supported: QASYMM8/QSYMM8.
+     * @param[in]  input  Source tensor. Data type supported: QASYMM8/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayerKernel
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data types supported: F16/F32.
      *
      * @return a status
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index dab4221a3b..37c7b50ec7 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -565,7 +565,7 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu
     DataType             &&first_data_type         = tensor_info_1->data_type();
     const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
 
-    if(!is_data_type_quantized_asymmetric(first_data_type))
+    if(!is_data_type_quantized(first_data_type))
     {
         return arm_compute::Status{};
     }
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index d44afcbb0f..b59f24eed5 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -95,6 +95,7 @@
 #include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
 #include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
 #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
+#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
 #include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
 #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
 #include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
index 8c24b38cee..c08366e5a7 100644
--- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
@@ -39,13 +39,13 @@ class NEDequantizationLayer : public INESimpleFunctionNoBorder
 public:
     /** Configure the kernel.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QSYMM8.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayer
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data type supported: F16/F32.
      *
      * @return a status
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
new file mode 100644
index 0000000000..b45d714990
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELSTMLAYERQUANTIZED_H__
+#define __ARM_COMPUTE_NELSTMLAYERQUANTIZED_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESlice.h"
+#include "arm_compute/runtime/NEON/functions/NETranspose.h"
+
+#include "arm_compute/runtime/common/LSTMParams.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Basic function to run @ref NELSTMLayerQuantized
+ *
+ * This function calls the following NEON functions/kernels:
+ *
+ * -# @ref NEGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
+ * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
+ * -# @ref NETranspose                                           Matrix transpose
+ * -# @ref NEConcatenateLayer                                    Tensor concatenation
+ * -# @ref NEActivationLayer                                     Activation functions (tanh and logistic)
+ * -# @ref NEArithmeticAddition                                  Elementwise addition
+ * -# @ref NEPixelWiseMultiplication                             Elementwise multiplication
+ * -# @ref NESlice                                               Tensor slicing
+ * -# @ref NEDequantizationLayer                                 Dequantize into float
+ * -# @ref NEQuantizationLayer                                   Quantize from float
+ * */
+class NELSTMLayerQuantized : public IFunction
+{
+public:
+    /** Default constructor */
+    NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELSTMLayerQuantized(const NELSTMLayerQuantized &) = delete;
+    /** Default move constructor */
+    NELSTMLayerQuantized(NELSTMLayerQuantized &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELSTMLayerQuantized &operator=(const NELSTMLayerQuantized &) = delete;
+    /** Default move assignment operator */
+    NELSTMLayerQuantized &operator=(NELSTMLayerQuantized &&) = default;
+    /** Initialize function's tensors.
+     *
+     * @param[in]  input                       Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8.
+     * @param[in]  input_to_input_weights      2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  input_to_forget_weights     2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  input_to_cell_weights       2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  input_to_output_weights     2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  recurrent_to_input_weights  2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  recurrent_to_forget_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  recurrent_to_cell_weights   2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  recurrent_to_output_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  input_gate_bias             1D weights tensor with dimensions [output_size]. Data type supported: S32.
+     * @param[in]  forget_gate_bias            1D weights tensor with dimensions [output_size]. Data type supported: S32.
+     * @param[in]  cell_bias                   1D weights tensor with dimensions [output_size]. Data type supported: S32.
+     * @param[in]  output_gate_bias            1D weights tensor with dimensions [output_size]. Data type supported: S32.
+     * @param[in]  cell_state_in               2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16.
+     * @param[in]  output_state_in             2D tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input.
+     * @param[out] cell_state_out              Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16.
+     * @param[out] output_state_out            Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input.
+     */
+    void configure(const ITensor *input,
+                   const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
+                   const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
+                   const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
+                   ITensor *cell_state_in, const ITensor *output_state_in,
+                   ITensor *cell_state_out, ITensor *output_state_out);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayerQuantized
+     *
+     * @param[in]  input                       Source tensor info. Input is a 2D tensor info with dimensions [input_size, batch_size]. Data types supported: QASYMM8.
+     * @param[in]  input_to_input_weights      2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  input_to_forget_weights     2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  input_to_cell_weights       2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  input_to_output_weights     2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  recurrent_to_input_weights  2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  recurrent_to_forget_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  recurrent_to_cell_weights   2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  recurrent_to_output_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input.
+     * @param[in]  input_gate_bias             1D weights tensor info with dimensions [output_size]. Data type supported: S32.
+     * @param[in]  forget_gate_bias            1D weights tensor info with dimensions [output_size]. Data type supported: S32.
+     * @param[in]  cell_bias                   1D weights tensor info with dimensions [output_size]. Data type supported: S32.
+     * @param[in]  output_gate_bias            1D weights tensor info with dimensions [output_size]. Data type supported: S32.
+     * @param[in]  cell_state_in               2D tensor info with dimensions [output_size, batch_size]. Data type supported: QSYMM16.
+     * @param[in]  output_state_in             2D tensor info with dimensions [output_size, batch_size]. Data type supported: Same as @p input.
+     * @param[out] cell_state_out              Destination tensor info. Output is a 2D tensor info with dimensions [output_size, batch_size]. Data type supported: QSYMM16.
+     * @param[out] output_state_out            Destination tensor info. Output is a 2D tensor info with dimensions [output_size, batch_size]. Data type supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+                           const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+                           const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+                           const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
+                           const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+
+    // Inherited methods overridden:
+    void run() override;
+    void prepare() override;
+
+private:
+    MemoryGroup _memory_group;
+
+    // Functions used
+    NEGEMMLowpMatrixMultiplyCore                        _gemmlowp;
+    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
+    NETranspose                                         _transpose_weights;
+    NEConcatenateLayer                                  _concat_input_weights;
+    NEConcatenateLayer                                  _concat_recurrent_weights;
+    NEConcatenateLayer                                  _concat_weights;
+    NEConcatenateLayer                                  _concat_inputs;
+    NEConcatenateLayer                                  _concat_bias;
+    NEActivationLayer                                   _sigmoid_forget_gate;
+    NEActivationLayer                                   _sigmoid_input_gate;
+    NEActivationLayer                                   _sigmoid_output_gate;
+    NEActivationLayer                                   _tanh_modulation_gate;
+    NEActivationLayer                                   _tanh_output_state;
+    NEArithmeticAddition                                _add1;
+    NEArithmeticAddition                                _add2;
+    NEPixelWiseMultiplication                           _mul1;
+    NEPixelWiseMultiplication                           _mul2;
+    NEPixelWiseMultiplication                           _mul3;
+    NESlice                                             _slice_input_tensor;
+    NESlice                                             _slice_forget_tensor;
+    NESlice                                             _slice_cell_tensor;
+    NESlice                                             _slice_output_tensor;
+    NEDequantizationLayer                               _dequantize;
+    NEQuantizationLayer                                 _quantize;
+
+    // Tensor pointers
+    const ITensor *_input_to_input_weights;
+    const ITensor *_input_to_forget_weights;
+    const ITensor *_input_to_cell_weights;
+    const ITensor *_input_to_output_weights;
+    const ITensor *_recurrent_to_input_weights;
+    const ITensor *_recurrent_to_forget_weights;
+    const ITensor *_recurrent_to_cell_weights;
+    const ITensor *_recurrent_to_output_weights;
+    const ITensor *_input_gate_bias;
+    const ITensor *_forget_gate_bias;
+    const ITensor *_cell_bias;
+    const ITensor *_output_gate_bias;
+
+    // Temporary tensors
+    Tensor _recurrent_weights;
+    Tensor _input_weights;
+    Tensor _weights;
+    Tensor _input;
+    Tensor _weights_transposed;
+    Tensor _output_highp;
+    Tensor _output_lowp;
+    Tensor _bias;
+    Tensor _forget_gate_input;
+    Tensor _input_gate_input;
+    Tensor _output_gate_input;
+    Tensor _input_modulation_gate_input;
+    Tensor _forget_gate_output;
+    Tensor _input_gate_output;
+    Tensor _output_gate_output;
+    Tensor _input_modulation_gate_output;
+    Tensor _cell_state1;
+    Tensor _cell_state2;
+    Tensor _output_state_tmp;
+    Tensor _output_state_out_symm;
+    Tensor _output_state_out_f32;
+
+    bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NELSTMLAYERQUANTIZED_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index 5e4b4f754c..46a62bd903 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -49,13 +49,13 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8
+     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QSYMM16
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayer
      *
      * @param[in] input  Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: F32/F16.
-     * @param[in] output Output tensor info. Data types supported: QASYMM8
+     * @param[in] output Output tensor info. Data types supported: QASYMM8/QSYMM16
      *
      * @return a status
      */
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index bf0a2ca7bf..d11f04a82f 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NESymm.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
@@ -42,7 +43,7 @@ namespace
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QSYMM16);
 
     if(output->tensor_shape().total_size() > 0)
     {
@@ -94,6 +95,27 @@ inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
+template <typename T>
+inline void store_result(T *ptr, const float32x4x2_t &v)
+{
+    ARM_COMPUTE_UNUSED(ptr, v); // Generic fallback: writes nothing; only the specialisations below store data
+}
+
+template <>
+inline void store_result<float>(float *ptr, const float32x4x2_t &v)
+{
+    wrapper::vstore(ptr, v.val[0]); // First vector of the pair goes to ptr[0..3]
+    wrapper::vstore(ptr + 4, v.val[1]); // Second vector of the pair goes to ptr[4..7]
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v)
+{
+    wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); // Convert both F32 vectors to F16, combine them and store the result with one call
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
 template <typename T>
 void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
 {
@@ -179,6 +201,48 @@ void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Wind
     in, out);
 }
 
+template <typename T>
+void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window)
+{
+    const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
+    const float                    scale = qinfo.scale; // Only the scale is read here; no offset is passed to the dequantize helpers
+
+    const int  window_step_x  = 8; // Elements handled per vectorised iteration
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    // Collapse the window where possible and reduce X to a single step: the X range is walked manually below so the tail can be handled
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Create iterators over the collapsed window for both tensors
+    Iterator in(input, win_collapsed);
+    Iterator out(output, win_collapsed);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto in_ptr  = reinterpret_cast<const int16_t *>(in.ptr());
+        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x) // Vectorised body: full groups of window_step_x elements
+        {
+            const auto vin  = wrapper::vloadq(in_ptr + x);
+            const auto vdeq = vdequantize_int16(vin, scale);
+
+            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); // NOTE(review): out_ptr is already T *, so this cast looks redundant
+        }
+
+        // Scalar tail: dequantize the remaining (fewer than window_step_x) elements one at a time
+        for(; x < window_end_x; ++x)
+        {
+            int16_t val    = *(in_ptr + x);
+            *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
+        }
+    },
+    in, out);
+}
+
 template <typename T>
 void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
 {
@@ -190,6 +254,9 @@ void run_dequantization_core(const ITensor *input, ITensor *output, const Window
         case DataType::QSYMM8:
             run_dequantization_qsymm8<T>(input, output, window);
             break;
+        case DataType::QSYMM16:
+            run_dequantization_qsymm16<T>(input, output, window);
+            break;
         default:
             ARM_COMPUTE_ERROR("Unsupported data type.");
     }
@@ -244,4 +311,4 @@ void NEDequantizationLayerKernel::run(const Window &window, const ThreadInfo &in
             ARM_COMPUTE_ERROR("Unsupported data type.");
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index ece291e0a3..c33e699999 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -45,7 +45,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
                                                          DataType::U8, DataType::S8, DataType::QASYMM8,
-                                                         DataType::U16, DataType::S16,
+                                                         DataType::U16, DataType::S16, DataType::QSYMM16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
 
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index 28f655c529..7b1ad9c2e8 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -61,7 +61,7 @@ Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, c
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
                                                          DataType::U8, DataType::S8, DataType::QASYMM8,
                                                          DataType::U16, DataType::S16, DataType::F16,
-                                                         DataType::U32, DataType::F32);
+                                                         DataType::U32, DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) + width_offset > output->dimension(0));
 
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
new file mode 100644
index 0000000000..05e05a5e57
--- /dev/null
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+// Fixed quantization parameters shared by configure() and validate(); all QSYMM16 infos have a zero offset
+const QuantizationInfo qasymm(1.f / 128.f, 128);   // 8-bit asymmetric: scale 2^-7, offset 128
+const QuantizationInfo qsymm_0(1.f / 32768.f, 0);  // qsymm16 with 0 integer bits (scale 2^-15)
+const QuantizationInfo qsymm_3(8.f / 32768.f, 0);  // qsymm16 with 3 integer bits (scale 2^-12)
+const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bits (scale 2^-11)
+} // namespace
+
+NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) // memory_manager is handed to _memory_group, which configure() uses to manage the intermediate tensors
+    : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
+      _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(),
+      _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr),
+      _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr),
+      _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(),
+      _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(),
+      _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(),
+      _is_prepared(false) // the one-off weight/bias packing is deferred to prepare()
+{
+}
+
+void NELSTMLayerQuantized::configure(const ITensor *input,
+                                     const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
+                                     const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
+                                     const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
+                                     ITensor *cell_state_in, const ITensor *output_state_in,
+                                     ITensor *cell_state_out, ITensor *output_state_out)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                 recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                 input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+
+    ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+                                                              input_to_output_weights->info(),
+                                                              recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+                                                              input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+    // Sizes are read from dimension 0 (X) and 1 (Y): input is (input_size, batch_size), each input weight matrix has output_size rows
+    const int input_size  = input->info()->dimension(0);
+    const int batch_size  = input->info()->dimension(1);
+    const int output_size = input_to_input_weights->info()->dimension(1);
+
+    const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
+    // Un-initialised state outputs get the shape validate() expects of the states and that the gate tensors below have: (output_size, batch_size)
+    auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_4));
+    auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QASYMM8, qasymm));
+
+    _input_to_input_weights      = input_to_input_weights;
+    _input_to_forget_weights     = input_to_forget_weights;
+    _input_to_cell_weights       = input_to_cell_weights;
+    _input_to_output_weights     = input_to_output_weights;
+    _recurrent_to_input_weights  = recurrent_to_input_weights;
+    _recurrent_to_forget_weights = recurrent_to_forget_weights;
+    _recurrent_to_cell_weights   = recurrent_to_cell_weights;
+    _recurrent_to_output_weights = recurrent_to_output_weights;
+    _input_gate_bias             = input_gate_bias;
+    _forget_gate_bias            = forget_gate_bias;
+    _cell_bias                   = cell_bias;
+    _output_gate_bias            = output_gate_bias;
+
+    // Weights concatenation: the four gate matrices are stacked along Y in the order input, forget, cell, output
+    std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights };
+    std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights };
+
+    _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY);
+
+    _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY);
+    // NOTE(review): _weights is laid out [recurrent | input] along X whereas _input below is [input | output_state_in]; confirm this column order is intended
+    std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights };
+    _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _concat_weights.configure(weights_vector, &_weights, Window::DimX);
+    _transpose_weights.configure(&_weights, &_weights_transposed);
+
+    // Input concatenation: input and output_state_in are joined along X
+    std::vector<const ITensor *> input_vector{ input, output_state_in };
+    _memory_group.manage(&_input);
+    _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+    _concat_inputs.configure(input_vector, &_input, Window::DimX);
+
+    // Bias concatenation (same gate order as the weights)
+    std::vector<const ITensor *> bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias };
+    _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32));
+    _concat_bias.configure(bias_vector, &_bias, Window::DimX);
+
+    // Invert the offset for gemmlowp (restored right after gemmlowp has been configured)
+    _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+
+    // Run gemmlowp
+    _memory_group.manage(&_output_highp);
+    _output_highp.allocator()->init(TensorInfo(TensorShape(4 * output_size, batch_size), 1, DataType::S32));
+    _gemmlowp.configure(&_input, &_weights_transposed, nullptr, &_output_highp);
+    _input.allocator()->allocate();
+
+    // Set the offset back
+    _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+
+    // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
+    _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
+
+    const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+
+    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+    _memory_group.manage(&_output_lowp);
+    _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
+    _output_highp.allocator()->allocate();
+    _bias.allocator()->allocate(); // NOTE(review): prepare() allocates _bias again before running the bias concatenation; confirm both calls are needed
+
+    // Get the gate tensors: slice the requantized result along X into the input, forget, cell and output gate inputs
+    _memory_group.manage(&_input_gate_input);
+    _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+    _memory_group.manage(&_forget_gate_input);
+    _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+    _memory_group.manage(&_input_modulation_gate_input);
+    _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+    _memory_group.manage(&_output_gate_input);
+    _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+    _output_lowp.allocator()->allocate();
+
+    // Forget gate
+    _memory_group.manage(&_forget_gate_output);
+    _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_input.allocator()->allocate();
+
+    // Input gate
+    _memory_group.manage(&_input_gate_output);
+    _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _input_gate_input.allocator()->allocate();
+
+    // Input modulation gate equation
+    _memory_group.manage(&_input_modulation_gate_output);
+    _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+    _input_modulation_gate_input.allocator()->allocate();
+
+    // Output gate
+    _memory_group.manage(&_output_gate_output);
+    _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_input.allocator()->allocate();
+
+    // Long term memory: cell_state_out = forget_gate * cell_state_in + input_gate * input_modulation_gate
+    _memory_group.manage(&_cell_state1);
+    _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _forget_gate_output.allocator()->allocate();
+
+    _memory_group.manage(&_cell_state2);
+    _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _input_modulation_gate_output.allocator()->allocate();
+    _input_gate_output.allocator()->allocate();
+
+    _add1.configure(&_cell_state1, &_cell_state2, cell_state_out, ConvertPolicy::SATURATE);
+    _cell_state1.allocator()->allocate();
+    _cell_state2.allocator()->allocate();
+
+    // Short term memory: output state = tanh(cell_state_out) * output_gate
+    _memory_group.manage(&_output_state_tmp);
+    _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+
+    _memory_group.manage(&_output_state_out_symm);
+    _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _output_gate_output.allocator()->allocate();
+    _output_state_tmp.allocator()->allocate();
+
+    // Requantize the output state from QSYMM16 to QASYMM8 (through an F32 intermediate)
+    _memory_group.manage(&_output_state_out_f32);
+    _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+    _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32);
+    _output_state_out_symm.allocator()->allocate();
+
+    _quantize.configure(&_output_state_out_f32, output_state_out);
+    _output_state_out_f32.allocator()->allocate();
+}
+
+Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
+                                      const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+                                      const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+                                      const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+                                      const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
+                                      const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
+                                        output_state_in, cell_state_out, output_state_out);
+
+    const int input_size  = input->dimension(0);
+    const int batch_size  = input->dimension(1);
+    const int output_size = input_to_input_weights->dimension(1);
+
+    // Dimensionality checks
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_input_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
+    // Reference infos: the shape, data type and quantization each group of tensors is compared against below
+    TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
+    TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
+    TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+    TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
+    TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+
+    // Shape checks
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
+
+    // Data type checks
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
+
+    // Quantization checks. NOTE(review): the recurrent weights are compared against the fixed qasymm info and input against the input-weights info, while configure() uses the input_to_input_weights quantization for all weights and qasymm for the input - confirm this is intended
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
+    // The output infos are only checked when their total size is non-zero; otherwise no constraint is applied here
+    if(cell_state_out->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
+    }
+
+    if(output_state_out->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_out);
+    }
+
+    return Status{};
+}
+
+void NELSTMLayerQuantized::run()
+{
+    prepare(); // packs the weights and biases on the first call; returns immediately once _is_prepared is set
+
+    // Acquire all the temporaries
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Concatenate the input with the previous output state
+    _concat_inputs.run();
+
+    // Run gemmlowp, then add the bias and requantize the S32 result to QSYMM16
+    _gemmlowp.run();
+    _output_stage.run();
+
+    // Slice the results
+    _slice_input_tensor.run();
+    _slice_forget_tensor.run();
+    _slice_cell_tensor.run();
+    _slice_output_tensor.run();
+
+    // Gates
+    // Forget gate
+    _sigmoid_forget_gate.run();
+
+    // Input gate
+    _sigmoid_input_gate.run();
+
+    // Input modulation gate
+    _tanh_modulation_gate.run();
+
+    // Output gate
+    _sigmoid_output_gate.run();
+
+    // Cell state (long term memory)
+    _mul1.run();
+    _mul2.run();
+    _add1.run();
+
+    // Output state (short term memory)
+    _tanh_output_state.run();
+    _mul3.run();
+
+    // Requantize output state from QSYMM16 to QASYMM8
+    _dequantize.run();
+    _quantize.run();
+}
+
+void NELSTMLayerQuantized::prepare()
+{
+    // Weights and biases are packed once; later calls return immediately
+    if(_is_prepared)
+    {
+        return;
+    }
+
+    _input_weights.allocator()->allocate(); // stack the four input-to-gate weight tensors
+    _concat_input_weights.run();
+    _input_to_input_weights->mark_as_unused();
+    _input_to_forget_weights->mark_as_unused();
+    _input_to_cell_weights->mark_as_unused();
+    _input_to_output_weights->mark_as_unused();
+
+    _recurrent_weights.allocator()->allocate(); // stack the four recurrent-to-gate weight tensors
+    _concat_recurrent_weights.run();
+    _recurrent_to_input_weights->mark_as_unused();
+    _recurrent_to_forget_weights->mark_as_unused();
+    _recurrent_to_cell_weights->mark_as_unused();
+    _recurrent_to_output_weights->mark_as_unused();
+
+    _weights.allocator()->allocate(); // join both stacks, then release the two partial stacks
+    _concat_weights.run();
+    _input_weights.mark_as_unused();
+    _input_weights.allocator()->free();
+    _recurrent_weights.mark_as_unused();
+    _recurrent_weights.allocator()->free();
+
+    _weights_transposed.allocator()->allocate(); // transpose the joined weights and release the untransposed copy
+    _transpose_weights.run();
+    _weights.mark_as_unused();
+    _weights.allocator()->free();
+
+    _bias.allocator()->allocate(); // concatenate the four gate biases
+    _concat_bias.run();
+    _input_gate_bias->mark_as_unused();
+    _forget_gate_bias->mark_as_unused();
+    _cell_bias->mark_as_unused();
+    _output_gate_bias->mark_as_unused();
+
+    _is_prepared = true;
+}
+
+} // namespace arm_compute
diff --git a/tests/validation/NEON/LSTMLayerQuantized.cpp b/tests/validation/NEON/LSTMLayerQuantized.cpp
new file mode 100644
index 0000000000..41c12c91e7
--- /dev/null
+++ b/tests/validation/NEON/LSTMLayerQuantized.cpp
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
+#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/Utils.h"
+#include "tests/datasets/LSTMLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+template <typename T>
+inline void fill_tensor(Tensor &tensor, const std::vector<T> &v)
+{
+    // Copy v element by element through a tensor iterator (rather than one flat copy) to account for padding
+    const TensorShape shape = tensor.info()->tensor_shape();
+    Window            full_window;
+    full_window.use_tensor_dimensions(shape);
+    Iterator dst(&tensor, full_window);
+    execute_window_loop(full_window, [&](const Coordinates & coords)
+    {
+        *reinterpret_cast<T *>(dst.ptr()) = v[coord2index(shape, coords)];
+    },
+    dst);
+}
+
+template <typename T>
+inline void fill_tensor(SimpleTensor<T> &tensor, const std::vector<T> &v)
+{
+    std::memcpy(tensor.data(), v.data(), v.size() * sizeof(T)); // flat byte copy of all of v; the caller must not pass more elements than tensor holds
+}
+
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(LSTMLayerQuantized)
+
+// *INDENT-OFF*
+// clang-format off
+TEST_CASE(IntegrationTestCaseSmall, framework::DatasetMode::PRECOMMIT)
+{
+    const int batch_size  = 2;
+    const int input_size  = 2;
+    const int output_size = 4;
+
+    // Quantization infos for the test tensors (qsymm_3 is declared but not referenced in this test case)
+    QuantizationInfo qasymm(1.f / 128.f, 128);
+    QuantizationInfo qweights(1.f / 128.f, 128);
+    QuantizationInfo qsymm_3(8.f / 32768.f, 0);
+    QuantizationInfo qsymm_4(16.f / 32768.f, 0);
+
+    TensorShape input_shape{ input_size, batch_size };
+    TensorShape input_weights_shape{ input_size, output_size };
+    TensorShape recurrent_weights_shape{ output_size, output_size };
+    TensorShape output_shape{ output_size, batch_size};
+    TensorShape bias_shape{ output_size };
+
+    auto input_to_input_weights      = create_tensor<Tensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_forget_weights     = create_tensor<Tensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_cell_weights       = create_tensor<Tensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_output_weights     = create_tensor<Tensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_input_weights  = create_tensor<Tensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_forget_weights = create_tensor<Tensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_cell_weights   = create_tensor<Tensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_output_weights = create_tensor<Tensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_gate_bias             = create_tensor<Tensor>(bias_shape, DataType::S32);
+    auto forget_gate_bias            = create_tensor<Tensor>(bias_shape, DataType::S32);
+    auto cell_gate_bias              = create_tensor<Tensor>(bias_shape, DataType::S32);
+    auto output_gate_bias            = create_tensor<Tensor>(bias_shape, DataType::S32);
+
+    // LSTM input
+    auto input = create_tensor<Tensor>(input_shape, DataType::QASYMM8, 1, qasymm);
+
+    // LSTM output state
+    auto output_state = create_tensor<Tensor>(output_shape, DataType::QASYMM8, 1, qasymm);
+
+    // LSTM cell state
+    auto cell_state = create_tensor<Tensor>(output_shape, DataType::QSYMM16, 1, qsymm_4);
+
+    NELSTMLayerQuantized lstmq;
+    // cell_state and output_state are passed as both the "in" and the "out" tensors, so each run() overwrites the state it read
+    lstmq.configure(&input, &input_to_input_weights, &input_to_forget_weights, &input_to_cell_weights, &input_to_output_weights,
+                    &recurrent_to_input_weights, &recurrent_to_forget_weights, &recurrent_to_cell_weights, &recurrent_to_output_weights,
+                    &input_gate_bias, &forget_gate_bias, &cell_gate_bias, &output_gate_bias, &cell_state, &output_state, &cell_state, &output_state);
+
+    input.allocator()->allocate();
+    input_to_input_weights.allocator()->allocate();
+    input_to_forget_weights.allocator()->allocate();
+    input_to_cell_weights.allocator()->allocate();
+    input_to_output_weights.allocator()->allocate();
+    recurrent_to_input_weights.allocator()->allocate();
+    recurrent_to_forget_weights.allocator()->allocate();
+    recurrent_to_cell_weights.allocator()->allocate();
+    recurrent_to_output_weights.allocator()->allocate();
+    input_gate_bias.allocator()->allocate();
+    forget_gate_bias.allocator()->allocate();
+    cell_gate_bias.allocator()->allocate();
+    output_gate_bias.allocator()->allocate();
+    cell_state.allocator()->allocate();
+    output_state.allocator()->allocate();
+    cell_state.allocator()->allocate(); // NOTE(review): cell_state and output_state were already allocated on the two lines above; these repeated calls look redundant
+    output_state.allocator()->allocate();
+
+    // Fill weights and biases
+    fill_tensor(input_to_input_weights, std::vector<uint8_t>{ 47,  168,
+                                                              66,  239,
+                                                               6,   42,
+                                                             237,  236 });
+
+    fill_tensor(input_to_forget_weights, std::vector<uint8_t> { 204,  193,
+                                                                148,  59,
+                                                                113,  17,
+                                                                 66, 197 });
+
+    fill_tensor(input_to_cell_weights, std::vector<uint8_t> { 172,  101,
+                                                              184, 209,
+                                                              165,  82,
+                                                              108, 209 });
+
+    fill_tensor(input_to_output_weights, std::vector<uint8_t> { 203, 244,
+                                                                219, 114,
+                                                                130,  16,
+                                                                163, 222 });
+
+    fill_tensor(recurrent_to_input_weights, std::vector<uint8_t> { 162, 168,  7,  95,
+                                                                    91, 155, 108, 216,
+                                                                   255, 100,  48, 188,
+                                                                    58,  37, 186, 147 });
+
+    fill_tensor(recurrent_to_forget_weights, std::vector<uint8_t> {  46,  58,  47, 170,
+                                                                    246,  96,  12,  99,
+                                                                     68,  23, 186, 161,
+                                                                    237, 164,  89,   6 });
+
+    fill_tensor(recurrent_to_cell_weights, std::vector<uint8_t> { 234,  99,   71, 206,
+                                                                  205, 159,   64, 253,
+                                                                  191, 148,  116,   8,
+                                                                  209, 136,   59, 138 });
+
+    fill_tensor(recurrent_to_output_weights, std::vector<uint8_t> {  23, 241, 137, 36,
+                                                                    206,   5, 227, 56,
+                                                                    254, 176, 231, 47,
+                                                                     18, 201, 161, 11 });
+
+    fill_tensor(input_gate_bias, std::vector<int>  {-103038,   30525,  115255, -38154 });
+    fill_tensor(forget_gate_bias, std::vector<int> { -23428,  126970,  116806,  46307 });
+    fill_tensor(cell_gate_bias, std::vector<int>   { 128006,   69949,  -42808,  42568 });
+    fill_tensor(output_gate_bias, std::vector<int> { -67066,  -53607,   47233,  7300  });
+
+    SimpleTensor<uint8_t> expected_output(output_shape, DataType::QASYMM8, 1, qasymm);
+
+    // Initialize state (128 is the qasymm offset and 0 the symmetric zero, so both states start at a real value of zero)
+    fill_tensor(output_state, std::vector<uint8_t> { 128, 128, 128, 128,
+                                                     128, 128, 128, 128 });
+    fill_tensor(cell_state, std::vector<int16_t> { 0, 0, 0, 0,
+                                                   0, 0, 0, 0 });
+
+    // First input
+    fill_tensor(input, std::vector<uint8_t> { 106,  193,
+                                              155,  150 });
+
+    fill_tensor(expected_output, std::vector<uint8_t> { 128, 130,  36, 134,
+                                                        128, 131,  35, 133 });
+
+    lstmq.run();
+    validate(Accessor(output_state), expected_output);
+
+    // Second run: the input tensor is not refilled; only the state written by the previous run differs
+    fill_tensor(expected_output, std::vector<uint8_t> { 128, 129, 12, 137,
+                                                        128, 131, 10, 136 });
+    lstmq.run();
+    validate(Accessor(output_state), expected_output);
+
+    // Third run: again the same input, with the state written by the second run
+    fill_tensor(expected_output, std::vector<uint8_t> { 128, 129, 8, 140,
+                                                        128, 130, 6, 138 });
+    lstmq.run();
+    validate(Accessor(output_state), expected_output);
+}
+
+TEST_CASE(IntegrationTestCaseLarge, framework::DatasetMode::PRECOMMIT)
+{
+    const int batch_size  = 16;
+    const int input_size  = 8;
+    const int output_size = 8;
+
+    // Feeds one fixed input batch and checks output_state against hard-coded values after each of two run() calls.
+    QuantizationInfo qasymm(1.f / 128.f, 128);
+    QuantizationInfo qweights(1.f / 128.f, 128);
+    QuantizationInfo qsymm_3(8.f / 32768.f, 0); // NOTE(review): not referenced again in this test case
+    QuantizationInfo qsymm_4(16.f / 32768.f, 0);
+
+    TensorShape input_shape{ input_size, batch_size };
+    TensorShape input_weights_shape{ input_size, output_size };
+    TensorShape recurrent_weights_shape{ output_size, output_size };
+    TensorShape output_shape{ output_size, batch_size};
+    TensorShape bias_shape{ output_size };
+
+    auto input_to_input_weights      = create_tensor<Tensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_forget_weights     = create_tensor<Tensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_cell_weights       = create_tensor<Tensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_output_weights     = create_tensor<Tensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_input_weights  = create_tensor<Tensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_forget_weights = create_tensor<Tensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_cell_weights   = create_tensor<Tensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_output_weights = create_tensor<Tensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_gate_bias             = create_tensor<Tensor>(bias_shape, DataType::S32);
+    auto forget_gate_bias            = create_tensor<Tensor>(bias_shape, DataType::S32);
+    auto cell_gate_bias              = create_tensor<Tensor>(bias_shape, DataType::S32);
+    auto output_gate_bias            = create_tensor<Tensor>(bias_shape, DataType::S32);
+
+    // LSTM input
+    auto input = create_tensor<Tensor>(input_shape, DataType::QASYMM8, 1, qasymm);
+
+    // LSTM output state
+    auto output_state = create_tensor<Tensor>(output_shape, DataType::QASYMM8, 1, qasymm);
+
+    // LSTM cell state
+    auto cell_state = create_tensor<Tensor>(output_shape, DataType::QSYMM16, 1, qsymm_4);
+
+    NELSTMLayerQuantized lstmq;
+    // cell_state/output_state are each passed twice below; presumably as both state-in and state-out so the state carries across run() calls - TODO confirm against configure()
+    lstmq.configure(&input, &input_to_input_weights, &input_to_forget_weights, &input_to_cell_weights, &input_to_output_weights,
+                    &recurrent_to_input_weights, &recurrent_to_forget_weights, &recurrent_to_cell_weights, &recurrent_to_output_weights,
+                    &input_gate_bias, &forget_gate_bias, &cell_gate_bias, &output_gate_bias, &cell_state, &output_state, &cell_state, &output_state);
+    // Allocate every tensor; this is done after configure() and before any data is written
+    input.allocator()->allocate();
+    input_to_input_weights.allocator()->allocate();
+    input_to_forget_weights.allocator()->allocate();
+    input_to_cell_weights.allocator()->allocate();
+    input_to_output_weights.allocator()->allocate();
+    recurrent_to_input_weights.allocator()->allocate();
+    recurrent_to_forget_weights.allocator()->allocate();
+    recurrent_to_cell_weights.allocator()->allocate();
+    recurrent_to_output_weights.allocator()->allocate();
+    input_gate_bias.allocator()->allocate();
+    forget_gate_bias.allocator()->allocate();
+    cell_gate_bias.allocator()->allocate();
+    output_gate_bias.allocator()->allocate();
+    cell_state.allocator()->allocate();
+    output_state.allocator()->allocate();
+
+    // Fill weights (64 values per matrix) and biases (8 values per gate)
+    fill_tensor(input_to_input_weights, std::vector<uint8_t>{ 141,  89, 200, 180,  46,  50,  87, 128,
+                                                              149, 227, 177, 187, 212, 229,  54, 111,
+                                                              131, 116,   3,  58, 196,  26, 131, 255,
+                                                               22, 106, 216,  69, 239,  12, 232, 207,
+                                                              184,  56, 236, 172,  28, 143, 161, 124,
+                                                              255,  33, 197, 122,  47, 197,  26, 229,
+                                                               91,  79,  11, 160,  26,  80, 100,  36,
+                                                              248, 186,  97,  61, 125,  46,  14, 100, });
+
+    fill_tensor(input_to_forget_weights, std::vector<uint8_t> { 237, 165, 141, 249,  72, 116, 36 , 115,
+                                                                234, 213,  85,  84,  59,  62, 150, 246,
+                                                                182, 102, 158, 214, 182, 183,  94,  11,
+                                                                158, 192,  92, 189, 160, 219, 206, 249,
+                                                                 88, 213, 193, 244, 151,  72, 129,  49,
+                                                                239,  83, 106,   9, 169, 187, 125, 171,
+                                                                 32, 141, 126,  92,  13,  36, 224, 150,
+                                                                187, 250, 178, 169,  89, 214,  91, 173 });
+
+    fill_tensor(input_to_cell_weights, std::vector<uint8_t> {  93, 103, 226, 139, 185, 252, 129, 171,
+                                                              159,  32,  25, 175, 224, 183, 165,  35,
+                                                              207,  69, 238, 228, 149, 214,  79,   6,
+                                                                5,  66, 102,  14,  19, 111,  36, 143,
+                                                               22,  85,  13,  78, 236, 121, 122,  77,
+                                                              249,  39,  88,  12, 205, 143,  93, 240,
+                                                              167,  89, 188,  50,  73,  69, 201, 251,
+                                                               59,  32, 203, 184, 139, 191, 199,  74});
+
+    fill_tensor(input_to_output_weights, std::vector<uint8_t> { 205,   7,  95, 104, 252, 143, 226,  73,
+                                                                229, 114, 152, 171, 221, 153,  73, 229,
+                                                                153, 165, 223, 239, 100,  38, 172, 211,
+                                                                226, 133, 239, 207, 116, 230, 170, 100,
+                                                                241,  95, 171, 124,  63, 115,  32, 127,
+                                                                141, 239,  53, 193, 201,  53, 104, 178,
+                                                                186, 212, 167, 107, 226, 230,  71, 213,
+                                                                148, 217,  19, 248, 233, 195, 183, 156 });
+
+    fill_tensor(recurrent_to_input_weights, std::vector<uint8_t> { 147, 112, 140, 103,   3, 255,  17,  49,
+                                                                    84, 112, 144, 213, 138, 142, 112,  66,
+                                                                   117,  30, 101,  35,  25, 132, 211, 229,
+                                                                   183, 208, 102,  16,  38,  85, 101, 152,
+                                                                   226,  83, 132,  22, 161, 110, 157, 129,
+                                                                   184,  63, 168,  42, 220, 126, 209, 157,
+                                                                     5,  88, 243,  83, 249,  19, 226, 209,
+                                                                   173,  96, 185,  77, 146, 227, 238, 136 });
+
+
+    fill_tensor(recurrent_to_forget_weights, std::vector<uint8_t> {  52, 132,  92, 200, 213,  32, 213,  37,
+                                                                    116, 142, 116, 180,   4, 172, 158, 143,
+                                                                    110,  40,  99,  28, 221, 153, 133,   2,
+                                                                    247, 144, 198, 100,  20,  15, 221, 196,
+                                                                    159, 178, 188, 151, 171,  15,  25, 217,
+                                                                    178, 109, 110, 118, 128,  39, 232, 234,
+                                                                    184, 214, 177,  13,  56,   6,  28, 252,
+                                                                     89, 187, 242,  59, 146, 111, 132, 129});
+
+    fill_tensor(recurrent_to_cell_weights, std::vector<uint8_t> {  70,  44, 137,  29,  36, 127,   1, 241,
+                                                                   26, 241, 142, 114,  67, 181,  49,  57,
+                                                                  131, 152, 175,  77,  23,  63,  37, 124,
+                                                                  150, 113,  95, 103, 110, 201,  69,  97,
+                                                                  196, 242,  62, 214,  66,  19,  45, 135,
+                                                                   22, 168, 149, 104,  77, 101,  36,  68,
+                                                                  170, 116, 222, 100, 109,   1, 154,  18,
+                                                                  133, 215, 105,  93,  31,  57, 231, 112 });
+
+
+    fill_tensor(recurrent_to_output_weights, std::vector<uint8_t> { 45 ,  181 ,  220 ,  219 ,   49  ,  63 ,   49  , 129,
+                                                                     7 ,  166 ,  104 ,  114 ,   83  ,  40 ,    1  , 195,
+                                                                   245 ,  142 ,   82 ,  232 ,  104  , 245 ,   82  , 196,
+                                                                   111 ,   56 ,  156 ,    9 ,  141  , 240 ,  180  , 148,
+                                                                   247 ,  198 ,  234 ,  137 ,   13  , 210 ,  161  , 192,
+                                                                   196 ,   59 ,  233 ,  184 ,  142  , 187 ,  140  , 166,
+                                                                     2 ,   95 ,  152 ,   46 ,   71  ,  46 ,  113  ,  32,
+                                                                   175 ,  229 ,   86 ,   87 ,   62  ,  93 ,   74  , 130});
+
+    fill_tensor(input_gate_bias, std::vector<int>  {  -40040, -106916,  -92315,  -79123,   45160, -17954,   50962, -63758 });
+    fill_tensor(forget_gate_bias, std::vector<int> { -128514,    8463,  -57831,  116977,  106547, -28132, -124557,  44941 });
+    fill_tensor(cell_gate_bias, std::vector<int>   { 88388  ,  123601, -116148,  -13022,   21619,  48926,   57523,  39332 });
+    fill_tensor(output_gate_bias, std::vector<int> {  59485 ,  -33070,   21386, -100633, -115959, 125768,  -56407,  24897 });
+
+    SimpleTensor<uint8_t> expected_output(output_shape, DataType::QASYMM8, 1, qasymm);
+
+    // Initialize state: every output_state element is 128 (equals the second qasymm argument, presumably the zero point) and every cell_state element is 0
+    fill_tensor(output_state, std::vector<uint8_t> { 128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128,
+                                                     128, 128, 128, 128, 128, 128, 128, 128 });
+
+    fill_tensor(cell_state, std::vector<int16_t> { 0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0,
+                                                   0, 0, 0, 0, 0, 0, 0, 0});
+
+    // First run: the input batch is filled here once (it is not refilled before the second run), followed by the expected output_state
+    fill_tensor(input, std::vector<uint8_t> { 247,  203, 159, 131, 182, 114, 207, 195,
+                                              48 ,  61 , 154,  16,  80, 101, 116, 255,
+                                              50 , 115 ,  45, 186,  75, 212,  98,  48,
+                                              88 , 146 ,  24, 143, 218, 174, 203, 200,
+                                             239 ,  16 ,  66, 136, 234,  54,  94,  51,
+                                             101 , 128 , 220, 213, 164,  82, 137, 255,
+                                              70 , 165 , 234, 220,  66,  35, 183, 206,
+                                              39 ,  57 , 180, 202,  23, 172, 224, 109,
+                                             102 , 215 , 186,  82, 215, 147,  85, 187,
+                                              96 , 249 ,  59, 116, 150,  44, 167, 128,
+                                              34 , 217 , 148, 193, 243,  38, 250, 208,
+                                             112 , 130 , 208,  29,  16, 122,  20,  92,
+                                              24 ,  72 , 104,  29, 150, 233, 151,  19,
+                                             158 , 192 , 254,  70,  73, 142, 106, 152,
+                                               3 ,  61 ,  24, 135, 212,   9,  80, 234,
+                                             147 , 246 ,  83, 249,  49,  14,  68,  50});
+
+    fill_tensor(expected_output, std::vector<uint8_t> {131, 128,  128,  128,  128,  180,  129,  133,
+                                                       136, 128,  126,  128,  128,  173,  135,  130,
+                                                       160, 128,  128,  128,  128,  138,  132,  129,
+                                                       131, 128,  127,  128,  128,  169,  129,  131,
+                                                       133, 128,  128,  128,  128,  182,  130,  129,
+                                                       131, 128,  128,  128,  128,  163,  129,  130,
+                                                       131, 128,  128,  128,  128,  149,  132,  129,
+                                                       143, 128,  127,  128,  128,  150,  134,  131,
+                                                       134, 128,  128,  128,  128,  167,  130,  130,
+                                                       131, 128,  128,  128,  128,  152,  132,  129,
+                                                       128, 128,  128,  128,  128,  169,  130,  130,
+                                                       173, 128,  128,  128,  128,  148,  139,  130,
+                                                       152, 128,  128,  128,  128,  168,  139,  132,
+                                                       147, 128,  128,  128,  128,  161,  131,  132,
+                                                       130, 128,  128,  128,  128,  159,  134,  128,
+                                                       140, 128,  128,  128,  128,  133,  132,  128 });
+
+    lstmq.run();
+    validate(Accessor(output_state), expected_output);
+
+    // Second run: the input tensor is left untouched; only the expected output_state is replaced before running again
+    fill_tensor(expected_output, std::vector<uint8_t> { 130,   128,   128,   128,   128,   205,   129,   137,
+                                                        135,   128,   127,   128,   128,   190,   137,   132,
+                                                        160,   128,   128,   128,   128,   142,   133,   131,
+                                                        130,   128,   128,   128,   128,   185,   129,   133,
+                                                        132,   128,   128,   128,   128,   198,   131,   130,
+                                                        130,   128,   128,   128,   128,   178,   130,   131,
+                                                        131,   128,   128,   128,   128,   158,   132,   131,
+                                                        142,   128,   127,   128,   128,   158,   135,   134,
+                                                        133,   128,   128,   128,   128,   178,   131,   132,
+                                                        131,   128,   128,   128,   128,   160,   132,   130,
+                                                        128,   128,   128,   128,   128,   190,   131,   131,
+                                                        170,   128,   128,   128,   128,   157,   142,   131,
+                                                        149,   128,   128,   128,   128,   178,   142,   135,
+                                                        145,   128,   128,   128,   129,   173,   132,   135,
+                                                        129,   128,   128,   128,   128,   171,   134,   129,
+                                                        140,   128,   128,   128,   128,   135,   132,   129});
+    lstmq.run();
+    validate(Accessor(output_state), expected_output);
+}
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END() // LSTMLayerQuantized
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
-- 
cgit v1.2.1