From 10c53f1ef317095ddcd9143bf759cc68ecb0e721 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Wed, 17 Jul 2019 16:11:53 +0100 Subject: COMPMID-2307: QUANTIZED_16BIT_LSTM operator for CL Change-Id: I1b52df359f1a368d585fac43a08496544dd2f86f Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/1568 Tested-by: Arm Jenkins Reviewed-by: Giuseppe Rossini Comments-Addressed: Arm Jenkins --- .../core/CL/kernels/CLDequantizationLayerKernel.h | 4 +- arm_compute/core/CL/kernels/CLStridedSliceKernel.h | 6 +- arm_compute/core/QuantizationInfo.h | 12 + arm_compute/runtime/CL/CLFunctions.h | 1 + .../runtime/CL/functions/CLConcatenateLayer.h | 10 +- .../runtime/CL/functions/CLDequantizationLayer.h | 4 +- .../runtime/CL/functions/CLLSTMLayerQuantized.h | 203 +++++++++ arm_compute/runtime/CL/functions/CLStridedSlice.h | 6 +- .../runtime/NEON/functions/NELSTMLayerQuantized.h | 2 +- .../CL/kernels/CLDequantizationLayerKernel.cpp | 7 +- src/core/CL/kernels/CLStridedSliceKernel.cpp | 2 +- .../kernels/CLWidthConcatenate4TensorsKernel.cpp | 2 +- .../NEON/kernels/NEDequantizationLayerKernel.cpp | 2 +- src/runtime/CL/functions/CLConcatenateLayer.cpp | 28 +- src/runtime/CL/functions/CLLSTMLayerQuantized.cpp | 397 ++++++++++++++++++ .../NEON/functions/NELSTMLayerQuantized.cpp | 8 +- tests/datasets/DatatypeDataset.h | 1 + tests/validation/CL/BatchConcatenateLayer.cpp | 9 +- tests/validation/CL/DepthConcatenateLayer.cpp | 9 +- tests/validation/CL/LSTMLayerQuantized.cpp | 458 +++++++++++++++++++++ tests/validation/CL/WidthConcatenateLayer.cpp | 9 +- tests/validation/NEON/LSTMLayerQuantized.cpp | 6 +- .../fixtures/DequantizationLayerFixture.h | 24 +- tests/validation/reference/DequantizationLayer.cpp | 11 +- 24 files changed, 1176 insertions(+), 45 deletions(-) create mode 100644 arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h create mode 100644 src/runtime/CL/functions/CLLSTMLayerQuantized.cpp create mode 100644 tests/validation/CL/LSTMLayerQuantized.cpp diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h index 6d37f6a1a5..0ee5a13638 100644 --- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h @@ -48,13 +48,13 @@ public: ~CLDequantizationLayerKernel() = default; /** Set the input, output, min and max. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QSYMM8. + * @param[in] input Source tensor. Data types supported: QASYMM8/QSYMM8/QSYMM16. * @param[out] output Destination tensor. Data types supported: F16/F32. */ void configure(const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8. + * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16. * @param[in] output Output tensor info. Data types supported: F16/F32. * * @return a status diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h index e104dcfdd7..5b69b3fd16 100644 --- a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h +++ b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,7 @@ public:
      *
      * @note Supported tensor rank: up to 4
      *
-     * @param[in]  input   Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+     * @param[in]  input   Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/QSYMM16/U32/S32/F16/F32
      * @param[out] output  Destination tensor. Data type supported: Same as @p input
      * @param[in]  starts  The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  ends    The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
@@ -72,7 +72,7 @@ public:
      *
      * @note Supported tensor rank: up to 4
      *
-     * @param[in]  input   Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+     * @param[in]  input   Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/QSYMM16/U32/S32/F16/F32
      * @param[in]  output  Destination tensor. Data type supported: Same as @p input
      * @param[in]  starts  The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  ends    The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 587a380d63..79afca0714 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -300,6 +300,18 @@ inline float dequantize(int8_t value, float scale)
     return value * scale;
 }
 
+/** Dequantize a value given a symmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] scale Scale to use for dequantization
+ *
+ * @return Dequantized value
+ */
+inline float dequantize(int16_t value, float scale)
+{
+    return value * scale;
+}
+
 /** Quantize a value given a 16-bit symmetric quantization scheme
  *
  * @param[in] value Value to quantize
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 8c154f2059..922fb6acd9 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -94,6 +94,7 @@
 #include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
 #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"
 #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
 #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
 #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
 #include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index b69930c7d3..fb9724d167 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -60,7 +60,8 @@ public:
      * @param[out] output        Output tensor. Data types supported: Same as @p input.
      * @param[in]  axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      */
-    void configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
+    void configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
+    void configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
     /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
      *
      * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
@@ -73,11 +74,18 @@ public:
      *
      * @return a status
      */
     static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+    static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
+    template <typename TensorType>
+    void configure_internal(std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis);
+
+    template <typename TensorInfoType>
+    static Status validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis);
+
     std::vector<std::unique_ptr<ICLKernel>> _concat_kernels;
     unsigned int                            _num_inputs;
     unsigned int                            _axis;
diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
index 2f7af01a84..ade589d79e 100644
--- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
@@ -40,13 +40,13 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.
-     *                    Data types supported: QASYMM8/QSYMM8.
+     *                    Data types supported: QASYMM8/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayer
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data type supported: F16/F32.
      *
      * @return a status
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
new file mode 100644
index 0000000000..e2d164c395
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_CLLSTMLAYERQUANTIZED_H__ +#define __ARM_COMPUTE_CLLSTMLAYERQUANTIZED_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" +#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" +#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLSlice.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" + +#include "arm_compute/runtime/common/LSTMParams.h" + +namespace arm_compute +{ +// Forward declarations +class ICLTensor; + +/** Basic function to run @ref CLLSTMLayerQuantized + * + * This function calls the following CL functions/kernels: + * + * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers + * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 + * -# @ref CLTranspose Matrix transpose + * -# @ref CLConcatenateLayer Tensor concatenation + * -# @ref CLActivationLayer Activation functions (tanh and logistic) + * -# @ref CLArithmeticAddition Elementwise addition + * -# @ref CLPixelWiseMultiplication Elementwise multiplication + * -# @ref CLSlice Tensor slicing + * -# @ref CLDequantizationLayer Dequantize into float + * -# @ref CLQuantizationLayer Quantize from float + * */ +class CLLSTMLayerQuantized : public IFunction +{ +public: + /** Default constructor */ + CLLSTMLayerQuantized(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLSTMLayerQuantized(const CLLSTMLayerQuantized &) = delete; + /** Default move constructor */ + CLLSTMLayerQuantized(CLLSTMLayerQuantized &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLSTMLayerQuantized &operator=(const CLLSTMLayerQuantized &) = delete; + /** Default move assignment operator */ + CLLSTMLayerQuantized &operator=(CLLSTMLayerQuantized &&) = default; + /** Initialize function's tensors. + * + * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8. + * @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_output_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_input_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_forget_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_cell_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. 
+ * @param[in] recurrent_to_output_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] forget_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] output_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_state_in 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[in] output_state_in 2D tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input. + * @param[out] cell_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input. + */ + void configure(const ICLTensor *input, + const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out); + + /** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayerQuantized + * + * @param[in] input Source tensor info. Input is a 2D tensor info with dimensions [input_size, batch_size]. Data types supported: QASYMM8. + * @param[in] input_to_input_weights 2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_forget_weights 2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_cell_weights 2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_output_weights 2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_input_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_forget_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_cell_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_output_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_gate_bias 1D weights tensor info with dimensions [output_size]. Data type supported: S32. + * @param[in] forget_gate_bias 1D weights tensor info with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_bias 1D weights tensor info with dimensions [output_size]. Data type supported: S32. 
+ * @param[in] output_gate_bias 1D weights tensor info with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_state_in 2D tensor info with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[in] output_state_in 2D tensor info with dimensions [output_size, batch_size]. Data type supported: Same as @p input. + * @param[out] cell_state_out Destination tensor info. Output is a 2D tensor info with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[out] output_state_out Destination tensor info. Output is a 2D tensor info with dimensions [output_size, batch_size].Data types supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + CLMemoryGroup _memory_group; + + // Functions used + CLGEMMLowpMatrixMultiplyCore _gemmlowp; + CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage; + CLTranspose _transpose_weights; + CLConcatenateLayer _concat_input_weights; + CLConcatenateLayer _concat_recurrent_weights; + CLConcatenateLayer _concat_weights; + CLConcatenateLayer _concat_inputs; + CLConcatenateLayer _concat_bias; + CLActivationLayer _sigmoid_forget_gate; + CLActivationLayer _sigmoid_input_gate; + CLActivationLayer _sigmoid_output_gate; + CLActivationLayer _tanh_modulation_gate; + CLActivationLayer _tanh_output_state; + CLArithmeticAddition _add_cell_state_tmps; + CLArithmeticAddition _add2; + CLPixelWiseMultiplication _mul_forget_gate_cell_state; + CLPixelWiseMultiplication _mul_input_gate_input_mod_gate; + CLPixelWiseMultiplication _mul_output_state_tmp_output_gate; + CLSlice _slice_input_tensor; + CLSlice _slice_forget_tensor; + CLSlice _slice_cell_tensor; + CLSlice _slice_output_tensor; + CLDequantizationLayer _dequantize; + CLQuantizationLayer _quantize; + + // Tensor pointers + const ICLTensor *_input_to_input_weights; + const ICLTensor *_input_to_forget_weights; + const ICLTensor *_input_to_cell_weights; + const ICLTensor *_input_to_output_weights; + const ICLTensor *_recurrent_to_input_weights; + const ICLTensor *_recurrent_to_forget_weights; + const ICLTensor *_recurrent_to_cell_weights; + const ICLTensor *_recurrent_to_output_weights; + const ICLTensor *_input_gate_bias; + const ICLTensor *_forget_gate_bias; + const ICLTensor *_cell_bias; + const ICLTensor *_output_gate_bias; + + // Temporary tensors + CLTensor _recurrent_weights; + CLTensor _input_weights; + CLTensor _weights; + CLTensor _input; + CLTensor _weights_transposed; + CLTensor _output_highp; + CLTensor _output_lowp; + CLTensor _bias; + CLTensor _forget_gate_input; + CLTensor _input_gate_input; + CLTensor _output_gate_input; + CLTensor _input_modulation_gate_input; + CLTensor _forget_gate_output; + CLTensor _input_gate_output; + CLTensor 
_output_gate_output; + CLTensor _input_modulation_gate_output; + CLTensor _cell_state_tmp1; + CLTensor _cell_state_tmp2; + CLTensor _output_state_tmp; + CLTensor _output_state_out_symm; + CLTensor _output_state_out_f32; + + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLLSTMLAYERQUANTIZED_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h index 4a336f6fdc..bb97b17fea 100644 --- a/arm_compute/runtime/CL/functions/CLStridedSlice.h +++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,7 +39,7 @@ public: * * @note Supported tensor rank: up to 4 * - * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/QSYMM16/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). @@ -57,7 +57,7 @@ public: * * @note Supported tensor rank: up to 4 * - * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/QSYMM16/U32/S32/F16/F32 * @param[in] output Destination tensor. Data type supported: Same as @p input * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). 
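For orientation, a minimal sketch of how the new CLLSTMLayerQuantized function is intended to be driven, based on the interface documented above. The sizes, tensor names and quantization parameters are illustrative assumptions; the integration tests added later in this patch exercise the same call sequence with concrete data.

// Sketch only: assumed shapes/quantization, not part of the patch itself.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"

using namespace arm_compute;

void run_one_lstm_step()
{
    CLScheduler::get().default_init();

    const int input_size = 2, output_size = 4, batch_size = 2;  // assumed sizes
    const QuantizationInfo qasymm(1.f / 128.f, 128);             // assumed input/output state quantization
    const QuantizationInfo qweights(1.f / 128.f, 128);           // assumed weights quantization
    const QuantizationInfo qsymm_4(16.f / 32768.f, 0);           // cell state: QSYMM16 with 4 integer bits

    // Stateful tensors: QASYMM8 input and output state, QSYMM16 cell state
    CLTensor input, output_state, cell_state;
    input.allocator()->init(TensorInfo(TensorShape(input_size, batch_size), 1, DataType::QASYMM8, qasymm));
    output_state.allocator()->init(TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QASYMM8, qasymm));
    cell_state.allocator()->init(TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_4));

    // Four input-to-gate weights, four recurrent-to-gate weights (QASYMM8) and four biases (S32)
    CLTensor i2i_w, i2f_w, i2c_w, i2o_w, r2i_w, r2f_w, r2c_w, r2o_w, i_bias, f_bias, c_bias, o_bias;
    for(CLTensor *w : { &i2i_w, &i2f_w, &i2c_w, &i2o_w })
        w->allocator()->init(TensorInfo(TensorShape(input_size, output_size), 1, DataType::QASYMM8, qweights));
    for(CLTensor *w : { &r2i_w, &r2f_w, &r2c_w, &r2o_w })
        w->allocator()->init(TensorInfo(TensorShape(output_size, output_size), 1, DataType::QASYMM8, qweights));
    for(CLTensor *b : { &i_bias, &f_bias, &c_bias, &o_bias })
        b->allocator()->init(TensorInfo(TensorShape(output_size), 1, DataType::S32));

    // The same state tensors are passed as "in" and "out", so each run() advances the state by one step
    CLLSTMLayerQuantized lstm;
    lstm.configure(&input, &i2i_w, &i2f_w, &i2c_w, &i2o_w, &r2i_w, &r2f_w, &r2c_w, &r2o_w,
                   &i_bias, &f_bias, &c_bias, &o_bias, &cell_state, &output_state, &cell_state, &output_state);

    // Allocate backing memory, then fill weights, biases, state and input (e.g. by mapping the tensors)
    for(CLTensor *t : { &input, &output_state, &cell_state, &i2i_w, &i2f_w, &i2c_w, &i2o_w,
                        &r2i_w, &r2f_w, &r2c_w, &r2o_w, &i_bias, &f_bias, &c_bias, &o_bias })
        t->allocator()->allocate();

    lstm.run(); // output_state and cell_state now hold the state after one time step
}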
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h index b45d714990..7f02988c19 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h @@ -53,7 +53,7 @@ class ITensor; * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 * -# @ref NETranspose Matrix transpose * -# @ref NEConcatenateLayer Tensor concatenation - * -# @ref NEActivationLayer Activation functions (tanh and logistig) + * -# @ref NEActivationLayer Activation functions (tanh and logistic) * -# @ref NEArithmeticAddition Elementwise addition * -# @ref NEPixelWiseMultiplication Elementwise multiplication * -# @ref NESlice Tensor slicing diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp index e383bc475d..12d36cdb9f 100644 --- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp @@ -33,14 +33,14 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -using namespace arm_compute; - +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QSYMM16); if(output->tensor_shape().total_size() > 0) { @@ -135,3 +135,4 @@ void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &qu } while(window_collapsed.slide_window_slice_3D(slice)); } +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp index c2bdf7f299..9dd488b678 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -48,7 +48,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, + DataType::U16, DataType::S16, DataType::QSYMM16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp index a3ac102564..4e673a9f38 100644 --- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp @@ -84,7 +84,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, - DataType::F32); + DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output); ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0)); diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp 
b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp index d11f04a82f..e52f53ea04 100644 --- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp @@ -194,7 +194,7 @@ void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Wind // Compute left-over elements for(; x < window_end_x; ++x) { - uint8_t val = *(in_ptr + x); + int8_t val = *(in_ptr + x); *(out_ptr + x) = static_cast(dequantize(val, scale)); } }, diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 1d396f5ebf..5d224db8e9 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -47,14 +47,35 @@ CLConcatenateLayer::CLConcatenateLayer() { } -void CLConcatenateLayer::configure(const std::vector &inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) +{ + configure_internal(std::move(inputs_vector), output, axis); +} + +void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) +{ + configure_internal(std::move(inputs_vector), output, axis); +} + +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +{ + return validate_internal(inputs_vector, output, axis); +} + +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +{ + return validate_internal(inputs_vector, output, axis); +} + +template +void CLConcatenateLayer::configure_internal(std::vector &&inputs_vector, ICLTensor *output, size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); _axis = axis; _num_inputs = inputs_vector.size(); std::vector inputs_vector_info(inputs_vector.size()); - std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](ICLTensor * t) + std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](TensorType * t) { ARM_COMPUTE_ERROR_ON_NULLPTR(t); return t->info(); @@ -141,7 +162,8 @@ void CLConcatenateLayer::configure(const std::vector &inputs_vector } } -Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +template +Status CLConcatenateLayer::validate_internal(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) { ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr); const unsigned int num_inputs = inputs_vector.size(); diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp new file mode 100644 index 0000000000..e0006a77d0 --- /dev/null +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace +{ +// Quantization info structures used in the LSTMQuantize layer +const QuantizationInfo qasymm(1.f / 128.f, 128); +const QuantizationInfo qsymm_3(8.f / 32768.f, 0); // qsymm16 with 3 integer bit +const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bit +const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit +} // namespace + +CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), + _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), + _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), + _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), + _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), + _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) +{ +} + +void CLLSTMLayerQuantized::configure(const ICLTensor *input, + const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, 
recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), + recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + const int input_size = input->info()->dimension(0); + const int batch_size = input->info()->dimension(1); + const int output_size = input_to_input_weights->info()->dimension(1); + + const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization + + auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + + _input_to_input_weights = input_to_input_weights; + _input_to_forget_weights = input_to_forget_weights; + _input_to_cell_weights = input_to_cell_weights; + _input_to_output_weights = input_to_output_weights; + _recurrent_to_input_weights = recurrent_to_input_weights; + _recurrent_to_forget_weights = recurrent_to_forget_weights; + _recurrent_to_cell_weights = recurrent_to_cell_weights; + _recurrent_to_output_weights = recurrent_to_output_weights; + _input_gate_bias = input_gate_bias; + _forget_gate_bias = forget_gate_bias; + _cell_bias = cell_bias; + _output_gate_bias = output_gate_bias; + + // Weights concatenation + std::vector inputs_weights_vector; + inputs_weights_vector.emplace_back(input_to_input_weights); + inputs_weights_vector.emplace_back(input_to_forget_weights); + inputs_weights_vector.emplace_back(input_to_cell_weights); + inputs_weights_vector.emplace_back(input_to_output_weights); + + std::vector recurrent_weights_vector; + recurrent_weights_vector.emplace_back(recurrent_to_input_weights); + recurrent_weights_vector.emplace_back(recurrent_to_forget_weights); + recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); + recurrent_weights_vector.emplace_back(recurrent_to_output_weights); + + _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); + + _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); + + std::vector weights_vector; + weights_vector.emplace_back(&_recurrent_weights); + weights_vector.emplace_back(&_input_weights); + + _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _concat_weights.configure(weights_vector, &_weights, Window::DimX); + _transpose_weights.configure(&_weights, &_weights_transposed); + + // Input concatenation + std::vector input_vector; + input_vector.emplace_back(input); + input_vector.emplace_back(output_state_in); + + _memory_group.manage(&_input); + 
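    // Implementation outline: the four input-to-gate and four recurrent-to-gate weight matrices are
    // concatenated into a single [input_size + output_size, 4 * output_size] matrix, and the input is
    // concatenated with the previous output state into a single [input_size + output_size, batch_size]
    // tensor, so that all four gate pre-activations are produced by one low-precision GEMM with S32
    // accumulators. The accumulators are then requantized to QSYMM16 with 3 integer bits (scale
    // 8 / 32768 = 2^-12), which is why the output stage multiplier below is 4096 * input_scale * weights_scale,
    // and the result is finally sliced into the forget, input, input-modulation and output gate inputs.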
_input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _concat_inputs.configure(input_vector, &_input, Window::DimX); + + // Bias concatenation + std::vector bias_vector; + bias_vector.emplace_back(input_gate_bias); + bias_vector.emplace_back(forget_gate_bias); + bias_vector.emplace_back(cell_bias); + bias_vector.emplace_back(output_gate_bias); + + _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); + _concat_bias.configure(bias_vector, &_bias, Window::DimX); + + // Invert the offset for gemmlowp + _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); + _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + + // Run gemmlowp + _memory_group.manage(&_output_highp); + _output_highp.allocator()->init(TensorInfo(TensorShape(4 * output_size, batch_size), 1, DataType::S32)); + _gemmlowp.configure(&_input, &_weights_transposed, nullptr, &_output_highp); + _input.allocator()->allocate(); + + // Set the offset back + _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); + _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + + // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) + _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); + + const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; + int output_multiplier = 0; + int output_shift = 0; + + quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + + _memory_group.manage(&_output_lowp); + _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + _output_highp.allocator()->allocate(); + _bias.allocator()->allocate(); + + // Get the gate tensors + _memory_group.manage(&_input_gate_input); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _memory_group.manage(&_forget_gate_input); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _memory_group.manage(&_input_modulation_gate_input); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _memory_group.manage(&_output_gate_input); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _output_lowp.allocator()->allocate(); + + // Forget gate + _memory_group.manage(&_forget_gate_output); + _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_input.allocator()->allocate(); + + // Input gate + _memory_group.manage(&_input_gate_output); + _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_input.allocator()->allocate(); + + // 
Input modulation gate equation + _memory_group.manage(&_input_modulation_gate_output); + _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_input.allocator()->allocate(); + + // Output gate + _memory_group.manage(&_output_gate_output); + _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_input.allocator()->allocate(); + + // Long term memory + _memory_group.manage(&_cell_state_tmp1); + _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(&_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _forget_gate_output.allocator()->allocate(); + + _memory_group.manage(&_cell_state_tmp2); + _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_input_gate_input_mod_gate.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _input_modulation_gate_output.allocator()->allocate(); + _input_gate_output.allocator()->allocate(); + + _add_cell_state_tmps.configure(&_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _cell_state_tmp1.allocator()->allocate(); + _cell_state_tmp2.allocator()->allocate(); + + // Short term memory + _memory_group.manage(&_output_state_tmp); + _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + + _memory_group.manage(&_output_state_out_symm); + _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_gate_output.allocator()->allocate(); + _output_state_tmp.allocator()->allocate(); + + // Requantize the output state from QSYMM16 to QASYMM8 + _memory_group.manage(&_output_state_out_f32); + _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); + _output_state_out_symm.allocator()->allocate(); + + _quantize.configure(&_output_state_out_f32, output_state_out); + _output_state_out_f32.allocator()->allocate(); +} + +Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, + const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo 
*recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out); + + const int input_size = input->dimension(0); + const int batch_size = input->dimension(1); + const int output_size = input_to_input_weights->dimension(1); + + // Dimensionality checks + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(input_to_input_weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); + + TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); + TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + + // Shape checks + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); + + // Data type checks + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); + + // Quantization checks + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); + + if(cell_state_out->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); + } + + if(output_state_out->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_out); + } + + return Status{}; +} + +void CLLSTMLayerQuantized::run() +{ + prepare(); + + // Acquire all the temporaries + MemoryGroupResourceScope scope_mg(_memory_group); + + // Concat and transpose the input + _concat_inputs.run(); + + // Run gemmlowp + _gemmlowp.run(); + _output_stage.run(); + + // Slice the results + _slice_input_tensor.run(); + _slice_forget_tensor.run(); + _slice_cell_tensor.run(); + _slice_output_tensor.run(); + + // Gates + // Forget gate + _sigmoid_forget_gate.run(); + + // Input gate + _sigmoid_input_gate.run(); + + // Input modulation gate + _tanh_modulation_gate.run(); + + // Output gate + _sigmoid_output_gate.run(); + + // Cell state (long term memory) + _mul_forget_gate_cell_state.run(); + _mul_input_gate_input_mod_gate.run(); + _add_cell_state_tmps.run(); + + // Output state (short term memory) + _tanh_output_state.run(); + _mul_output_state_tmp_output_gate.run(); + + // Requantize output state from QSYMM16 to QASYMM16 + _dequantize.run(); + _quantize.run(); +} + +void CLLSTMLayerQuantized::prepare() +{ + if(!_is_prepared) + { + _input_weights.allocator()->allocate(); + _concat_input_weights.run(); + + _input_to_input_weights->mark_as_unused(); + _input_to_forget_weights->mark_as_unused(); + _input_to_cell_weights->mark_as_unused(); + _input_to_output_weights->mark_as_unused(); + + _recurrent_weights.allocator()->allocate(); + _concat_recurrent_weights.run(); + _recurrent_to_input_weights->mark_as_unused(); + _recurrent_to_forget_weights->mark_as_unused(); + _recurrent_to_cell_weights->mark_as_unused(); + _recurrent_to_output_weights->mark_as_unused(); + + _weights.allocator()->allocate(); + _concat_weights.run(); + + _input_weights.mark_as_unused(); + _input_weights.allocator()->free(); + _recurrent_weights.mark_as_unused(); + _recurrent_weights.allocator()->free(); + + _weights_transposed.allocator()->allocate(); + _transpose_weights.run(); + + _weights.mark_as_unused(); + _weights.allocator()->free(); + + _bias.allocator()->allocate(); + _concat_bias.run(); + _input_gate_bias->mark_as_unused(); + _forget_gate_bias->mark_as_unused(); + _cell_bias->mark_as_unused(); + _output_gate_bias->mark_as_unused(); + + _is_prepared = true; + } +} + +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index 05e05a5e57..6cfa9887ff 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -240,7 
+240,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); @@ -254,14 +254,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // Data type checks ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); diff --git a/tests/datasets/DatatypeDataset.h b/tests/datasets/DatatypeDataset.h index bb2774b4b3..a158a5f52d 100644 --- a/tests/datasets/DatatypeDataset.h +++ b/tests/datasets/DatatypeDataset.h @@ -43,6 +43,7 @@ public: { DataType::QSYMM8, DataType::QASYMM8, + DataType::QSYMM16, }) { } diff --git a/tests/validation/CL/BatchConcatenateLayer.cpp b/tests/validation/CL/BatchConcatenateLayer.cpp index b789569155..6c4ffee1dc 100644 --- a/tests/validation/CL/BatchConcatenateLayer.cpp +++ b/tests/validation/CL/BatchConcatenateLayer.cpp @@ -97,9 +97,12 
@@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - - concat_layer.configure({ &src1, &src2, &src3 }, &dst, 3); + CLConcatenateLayer concat_layer; + std::vector inputs; + inputs.emplace_back(&src1); + inputs.emplace_back(&src2); + inputs.emplace_back(&src3); + concat_layer.configure(inputs, &dst, 3); } template using CLBatchConcatenateLayerFixture = ConcatenateLayerValidationFixture; diff --git a/tests/validation/CL/DepthConcatenateLayer.cpp b/tests/validation/CL/DepthConcatenateLayer.cpp index 8cbfda382b..c67ed05ecd 100644 --- a/tests/validation/CL/DepthConcatenateLayer.cpp +++ b/tests/validation/CL/DepthConcatenateLayer.cpp @@ -94,9 +94,12 @@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - - concat_layer.configure({ &src1, &src2, &src3 }, &dst, 2); + CLConcatenateLayer concat_layer; + std::vector inputs; + inputs.emplace_back(&src1); + inputs.emplace_back(&src2); + inputs.emplace_back(&src3); + concat_layer.configure(inputs, &dst, 2); } template using CLDepthConcatenateLayerFixture = ConcatenateLayerValidationFixture; diff --git a/tests/validation/CL/LSTMLayerQuantized.cpp b/tests/validation/CL/LSTMLayerQuantized.cpp new file mode 100644 index 0000000000..1fc0af1ecb --- /dev/null +++ b/tests/validation/CL/LSTMLayerQuantized.cpp @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/Utils.h"
+#include "tests/datasets/LSTMLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+template <typename T>
+inline void fill_tensor(CLTensor &tensor, const std::vector<T> &v)
+{
+    tensor.map(true);
+    // Import memory accounting for padding
+    TensorShape t_shape = tensor.info()->tensor_shape();
+    Window window;
+    window.use_tensor_dimensions(t_shape);
+    Iterator out(&tensor, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        *reinterpret_cast<T *>(out.ptr()) = v[coord2index(t_shape, id)];
+    },
+    out);
+    tensor.unmap();
+}
+
+template <typename T>
+inline void fill_tensor(SimpleTensor<T> &tensor, const std::vector<T> &v)
+{
+    std::memcpy(tensor.data(), v.data(), sizeof(T) * v.size());
+}
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(LSTMLayerQuantized)
+
+// *INDENT-OFF*
+// clang-format off
+TEST_CASE(IntegrationTestCaseSmall, framework::DatasetMode::PRECOMMIT)
+{
+    const int batch_size  = 2;
+    const int input_size  = 2;
+    const int output_size = 4;
+
+
+    QuantizationInfo qasymm(1.f / 128.f, 128);
+    QuantizationInfo qweights(1.f / 128.f, 128);
+    QuantizationInfo qsymm_3(8.f / 32768.f, 0);
+    QuantizationInfo qsymm_4(16.f / 32768.f, 0);
+
+    TensorShape input_shape{ input_size, batch_size };
+    TensorShape input_weights_shape{ input_size, output_size };
+    TensorShape recurrent_weights_shape{ output_size, output_size };
+    TensorShape output_shape{ output_size, batch_size};
+    TensorShape bias_shape{ output_size };
+
+    auto input_to_input_weights      = create_tensor<CLTensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_forget_weights     = create_tensor<CLTensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_cell_weights       = create_tensor<CLTensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_output_weights     = create_tensor<CLTensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_input_weights  = create_tensor<CLTensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_forget_weights = create_tensor<CLTensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_cell_weights   = create_tensor<CLTensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_output_weights = create_tensor<CLTensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_gate_bias             = create_tensor<CLTensor>(bias_shape, DataType::S32);
+    auto forget_gate_bias            = create_tensor<CLTensor>(bias_shape, DataType::S32);
+    auto cell_gate_bias              = create_tensor<CLTensor>(bias_shape, DataType::S32);
+    auto output_gate_bias            = create_tensor<CLTensor>(bias_shape, DataType::S32);
+
+    // LSTM input
+    auto input = create_tensor<CLTensor>(input_shape, DataType::QASYMM8, 1, qasymm);
+
+    // LSTM output state
+    auto output_state = create_tensor<CLTensor>(output_shape, DataType::QASYMM8, 1, qasymm);
+
+    // LSTM cell state
+    auto cell_state = create_tensor<CLTensor>(output_shape, DataType::QSYMM16, 1, qsymm_4);
+
+    CLLSTMLayerQuantized lstmq;
+
+    lstmq.configure(&input, &input_to_input_weights, &input_to_forget_weights, &input_to_cell_weights, &input_to_output_weights,
+                    &recurrent_to_input_weights, &recurrent_to_forget_weights, &recurrent_to_cell_weights, &recurrent_to_output_weights,
+                    &input_gate_bias, &forget_gate_bias, &cell_gate_bias,
&output_gate_bias, &cell_state, &output_state, &cell_state, &output_state); + + input.allocator()->allocate(); + input_to_input_weights.allocator()->allocate(); + input_to_forget_weights.allocator()->allocate(); + input_to_cell_weights.allocator()->allocate(); + input_to_output_weights.allocator()->allocate(); + recurrent_to_input_weights.allocator()->allocate(); + recurrent_to_forget_weights.allocator()->allocate(); + recurrent_to_cell_weights.allocator()->allocate(); + recurrent_to_output_weights.allocator()->allocate(); + input_gate_bias.allocator()->allocate(); + forget_gate_bias.allocator()->allocate(); + cell_gate_bias.allocator()->allocate(); + output_gate_bias.allocator()->allocate(); + cell_state.allocator()->allocate(); + output_state.allocator()->allocate(); + + // Fill weights and biases + fill_tensor(input_to_input_weights, std::vector{ 47, 168, + 66, 239, + 6, 42, + 237, 236 }); + + fill_tensor(input_to_forget_weights, std::vector { 204, 193, + 148, 59, + 113, 17, + 66, 197 }); + + fill_tensor(input_to_cell_weights, std::vector { 172, 101, + 184, 209, + 165, 82, + 108, 209 }); + + fill_tensor(input_to_output_weights, std::vector { 203, 244, + 219, 114, + 130, 16, + 163, 222 }); + + fill_tensor(recurrent_to_input_weights, std::vector { 162, 168, 7, 95, + 91, 155, 108, 216, + 255, 100, 48, 188, + 58, 37, 186, 147 }); + + fill_tensor(recurrent_to_forget_weights, std::vector { 46, 58, 47, 170, + 246, 96, 12, 99, + 68, 23, 186, 161, + 237, 164, 89, 6 }); + + fill_tensor(recurrent_to_cell_weights, std::vector { 234, 99, 71, 206, + 205, 159, 64, 253, + 191, 148, 116, 8, + 209, 136, 59, 138 }); + + fill_tensor(recurrent_to_output_weights, std::vector { 23, 241, 137, 36, + 206, 5, 227, 56, + 254, 176, 231, 47, + 18, 201, 161, 11 }); + + fill_tensor(input_gate_bias, std::vector {-103038, 30525, 115255, -38154 }); + fill_tensor(forget_gate_bias, std::vector { -23428, 126970, 116806, 46307 }); + fill_tensor(cell_gate_bias, std::vector { 128006, 69949, -42808, 42568 }); + fill_tensor(output_gate_bias, std::vector { -67066, -53607, 47233, 7300 }); + + SimpleTensor expected_output(output_shape, DataType::QASYMM8, 1, qasymm); + + // Initialize state + fill_tensor(output_state, std::vector { 128, 128, 128, 128, + 128, 128, 128, 128 }); + fill_tensor(cell_state, std::vector { 0, 0, 0, 0, + 0, 0, 0, 0 }); + + // First input + fill_tensor(input, std::vector { 106, 193, + 155, 150 }); + + fill_tensor(expected_output, std::vector { 128, 130, 36, 134, + 128, 131, 35, 133 }); + + lstmq.run(); + validate(CLAccessor(output_state), expected_output); + + // Second input + fill_tensor(expected_output, std::vector { 128, 129, 12, 137, + 128, 131, 10, 136 }); + lstmq.run(); + validate(CLAccessor(output_state), expected_output); + + // Third input + fill_tensor(expected_output, std::vector { 128, 129, 8, 140, + 128, 130, 6, 138 }); + lstmq.run(); + validate(CLAccessor(output_state), expected_output); +} + +TEST_CASE(IntegrationTestCaseLarge, framework::DatasetMode::PRECOMMIT) +{ + const int batch_size = 16; + const int input_size = 8; + const int output_size = 8; + + + QuantizationInfo qasymm(1.f / 128.f, 128); + QuantizationInfo qweights(1.f / 128.f, 128); + QuantizationInfo qsymm_3(8.f / 32768.f, 0); + QuantizationInfo qsymm_4(16.f / 32768.f, 0); + + TensorShape input_shape{ input_size, batch_size }; + TensorShape input_weights_shape{ input_size, output_size }; + TensorShape recurrent_weights_shape{ output_size, output_size }; + TensorShape output_shape{ output_size, batch_size}; + TensorShape bias_shape{ 
output_size }; + + auto input_to_input_weights = create_tensor(input_weights_shape, DataType::QASYMM8, 1, qweights); + auto input_to_forget_weights = create_tensor(input_weights_shape, DataType::QASYMM8, 1, qweights); + auto input_to_cell_weights = create_tensor(input_weights_shape, DataType::QASYMM8, 1, qweights); + auto input_to_output_weights = create_tensor(input_weights_shape, DataType::QASYMM8, 1, qweights); + auto recurrent_to_input_weights = create_tensor(recurrent_weights_shape, DataType::QASYMM8, 1, qweights); + auto recurrent_to_forget_weights = create_tensor(recurrent_weights_shape, DataType::QASYMM8, 1, qweights); + auto recurrent_to_cell_weights = create_tensor(recurrent_weights_shape, DataType::QASYMM8, 1, qweights); + auto recurrent_to_output_weights = create_tensor(recurrent_weights_shape, DataType::QASYMM8, 1, qweights); + auto input_gate_bias = create_tensor(bias_shape, DataType::S32); + auto forget_gate_bias = create_tensor(bias_shape, DataType::S32); + auto cell_gate_bias = create_tensor(bias_shape, DataType::S32); + auto output_gate_bias = create_tensor(bias_shape, DataType::S32); + + // LSTM input + auto input = create_tensor(input_shape, DataType::QASYMM8, 1, qasymm); + + // LSTM output state + auto output_state = create_tensor(output_shape, DataType::QASYMM8, 1, qasymm); + + // LSTM cell state + auto cell_state = create_tensor(output_shape, DataType::QSYMM16, 1, qsymm_4); + + CLLSTMLayerQuantized lstmq; + + lstmq.configure(&input, &input_to_input_weights, &input_to_forget_weights, &input_to_cell_weights, &input_to_output_weights, + &recurrent_to_input_weights, &recurrent_to_forget_weights, &recurrent_to_cell_weights, &recurrent_to_output_weights, + &input_gate_bias, &forget_gate_bias, &cell_gate_bias, &output_gate_bias, &cell_state, &output_state, &cell_state, &output_state); + + input.allocator()->allocate(); + input_to_input_weights.allocator()->allocate(); + input_to_forget_weights.allocator()->allocate(); + input_to_cell_weights.allocator()->allocate(); + input_to_output_weights.allocator()->allocate(); + recurrent_to_input_weights.allocator()->allocate(); + recurrent_to_forget_weights.allocator()->allocate(); + recurrent_to_cell_weights.allocator()->allocate(); + recurrent_to_output_weights.allocator()->allocate(); + input_gate_bias.allocator()->allocate(); + forget_gate_bias.allocator()->allocate(); + cell_gate_bias.allocator()->allocate(); + output_gate_bias.allocator()->allocate(); + cell_state.allocator()->allocate(); + output_state.allocator()->allocate(); + + // Fill weights and biases + fill_tensor(input_to_input_weights, std::vector{ 141, 89, 200, 180, 46, 50, 87, 128, + 149, 227, 177, 187, 212, 229, 54, 111, + 131, 116, 3, 58, 196, 26, 131, 255, + 22, 106, 216, 69, 239, 12, 232, 207, + 184, 56, 236, 172, 28, 143, 161, 124, + 255, 33, 197, 122, 47, 197, 26, 229, + 91, 79, 11, 160, 26, 80, 100, 36, + 248, 186, 97, 61, 125, 46, 14, 100, }); + + fill_tensor(input_to_forget_weights, std::vector { 237, 165, 141, 249, 72, 116, 36 , 115, + 234, 213, 85, 84, 59, 62, 150, 246, + 182, 102, 158, 214, 182, 183, 94, 11, + 158, 192, 92, 189, 160, 219, 206, 249, + 88, 213, 193, 244, 151, 72, 129, 49, + 239, 83, 106, 9, 169, 187, 125, 171, + 32, 141, 126, 92, 13, 36, 224, 150, + 187, 250, 178, 169, 89, 214, 91, 173 }); + + fill_tensor(input_to_cell_weights, std::vector { 93, 103, 226, 139, 185, 252, 129, 171, + 159, 32, 25, 175, 224, 183, 165, 35, + 207, 69, 238, 228, 149, 214, 79, 6, + 5, 66, 102, 14, 19, 111, 36, 143, + 22, 85, 13, 78, 236, 121, 122, 77, + 249, 39, 
88, 12, 205, 143, 93, 240, + 167, 89, 188, 50, 73, 69, 201, 251, + 59, 32, 203, 184, 139, 191, 199, 74}); + + fill_tensor(input_to_output_weights, std::vector { 205, 7, 95, 104, 252, 143, 226, 73, + 229, 114, 152, 171, 221, 153, 73, 229, + 153, 165, 223, 239, 100, 38, 172, 211, + 226, 133, 239, 207, 116, 230, 170, 100, + 241, 95, 171, 124, 63, 115, 32, 127, + 141, 239, 53, 193, 201, 53, 104, 178, + 186, 212, 167, 107, 226, 230, 71, 213, + 148, 217, 19, 248, 233, 195, 183, 156 }); + + fill_tensor(recurrent_to_input_weights, std::vector { 147, 112, 140, 103, 3, 255, 17, 49, + 84, 112, 144, 213, 138, 142, 112, 66, + 117, 30, 101, 35, 25, 132, 211, 229, + 183, 208, 102, 16, 38, 85, 101, 152, + 226, 83, 132, 22, 161, 110, 157, 129, + 184, 63, 168, 42, 220, 126, 209, 157, + 5, 88, 243, 83, 249, 19, 226, 209, + 173, 96, 185, 77, 146, 227, 238, 136 }); + + + fill_tensor(recurrent_to_forget_weights, std::vector { 52, 132, 92, 200, 213, 32, 213, 37, + 116, 142, 116, 180, 4, 172, 158, 143, + 110, 40, 99, 28, 221, 153, 133, 2, + 247, 144, 198, 100, 20, 15, 221, 196, + 159, 178, 188, 151, 171, 15, 25, 217, + 178, 109, 110, 118, 128, 39, 232, 234, + 184, 214, 177, 13, 56, 6, 28, 252, + 89, 187, 242, 59, 146, 111, 132, 129}); + + fill_tensor(recurrent_to_cell_weights, std::vector { 70, 44, 137, 29, 36, 127, 1, 241, + 26, 241, 142, 114, 67, 181, 49, 57, + 131, 152, 175, 77, 23, 63, 37, 124, + 150, 113, 95, 103, 110, 201, 69, 97, + 196, 242, 62, 214, 66, 19, 45, 135, + 22, 168, 149, 104, 77, 101, 36, 68, + 170, 116, 222, 100, 109, 1, 154, 18, + 133, 215, 105, 93, 31, 57, 231, 112 }); + + + fill_tensor(recurrent_to_output_weights, std::vector { 45 , 181 , 220 , 219 , 49 , 63 , 49 , 129, + 7 , 166 , 104 , 114 , 83 , 40 , 1 , 195, + 245 , 142 , 82 , 232 , 104 , 245 , 82 , 196, + 111 , 56 , 156 , 9 , 141 , 240 , 180 , 148, + 247 , 198 , 234 , 137 , 13 , 210 , 161 , 192, + 196 , 59 , 233 , 184 , 142 , 187 , 140 , 166, + 2 , 95 , 152 , 46 , 71 , 46 , 113 , 32, + 175 , 229 , 86 , 87 , 62 , 93 , 74 , 130}); + + fill_tensor(input_gate_bias, std::vector { -40040, -106916, -92315, -79123, 45160, -17954, 50962, -63758 }); + fill_tensor(forget_gate_bias, std::vector { -128514, 8463, -57831, 116977, 106547, -28132, -124557, 44941 }); + fill_tensor(cell_gate_bias, std::vector { 88388 , 123601, -116148, -13022, 21619, 48926, 57523, 39332 }); + fill_tensor(output_gate_bias, std::vector { 59485 , -33070, 21386, -100633, -115959, 125768, -56407, 24897 }); + + SimpleTensor expected_output(output_shape, DataType::QASYMM8, 1, qasymm); + + // Initialize state + fill_tensor(output_state, std::vector { 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 }); + + fill_tensor(cell_state, std::vector { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 
+ 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}); + + // First input + fill_tensor(input, std::vector { 247, 203, 159, 131, 182, 114, 207, 195, + 48 , 61 , 154, 16, 80, 101, 116, 255, + 50 , 115 , 45, 186, 75, 212, 98, 48, + 88 , 146 , 24, 143, 218, 174, 203, 200, + 239 , 16 , 66, 136, 234, 54, 94, 51, + 101 , 128 , 220, 213, 164, 82, 137, 255, + 70 , 165 , 234, 220, 66, 35, 183, 206, + 39 , 57 , 180, 202, 23, 172, 224, 109, + 102 , 215 , 186, 82, 215, 147, 85, 187, + 96 , 249 , 59, 116, 150, 44, 167, 128, + 34 , 217 , 148, 193, 243, 38, 250, 208, + 112 , 130 , 208, 29, 16, 122, 20, 92, + 24 , 72 , 104, 29, 150, 233, 151, 19, + 158 , 192 , 254, 70, 73, 142, 106, 152, + 3 , 61 , 24, 135, 212, 9, 80, 234, + 147 , 246 , 83, 249, 49, 14, 68, 50}); + + fill_tensor(expected_output, std::vector {131, 128, 128, 128, 128, 180, 129, 133, + 136, 128, 126, 128, 128, 173, 135, 130, + 160, 128, 128, 128, 128, 138, 132, 129, + 131, 128, 127, 128, 128, 169, 129, 131, + 133, 128, 128, 128, 128, 182, 130, 129, + 131, 128, 128, 128, 128, 163, 129, 130, + 131, 128, 128, 128, 128, 149, 132, 129, + 143, 128, 127, 128, 128, 150, 134, 131, + 134, 128, 128, 128, 128, 167, 130, 130, + 131, 128, 128, 128, 128, 152, 132, 129, + 128, 128, 128, 128, 128, 169, 130, 130, + 173, 128, 128, 128, 128, 148, 139, 130, + 152, 128, 128, 128, 128, 168, 139, 132, + 147, 128, 128, 128, 128, 161, 131, 132, + 130, 128, 128, 128, 128, 159, 134, 128, + 140, 128, 128, 128, 128, 133, 132, 128 }); + + lstmq.run(); + validate(CLAccessor(output_state), expected_output); + + // Second input + fill_tensor(expected_output, std::vector { 130, 128, 128, 128, 128, 205, 129, 137, + 135, 128, 127, 128, 128, 190, 137, 132, + 160, 128, 128, 128, 128, 142, 133, 131, + 130, 128, 128, 128, 128, 185, 129, 133, + 132, 128, 128, 128, 128, 198, 131, 130, + 130, 128, 128, 128, 128, 178, 130, 131, + 131, 128, 128, 128, 128, 158, 132, 131, + 142, 128, 127, 128, 128, 158, 135, 134, + 133, 128, 128, 128, 128, 178, 131, 132, + 131, 128, 128, 128, 128, 160, 132, 130, + 128, 128, 128, 128, 128, 190, 131, 131, + 170, 128, 128, 128, 128, 157, 142, 131, + 149, 128, 128, 128, 128, 178, 142, 135, + 145, 128, 128, 128, 129, 173, 132, 135, + 129, 128, 128, 128, 128, 171, 134, 129, + 140, 128, 128, 128, 128, 135, 132, 129}); + lstmq.run(); + validate(CLAccessor(output_state), expected_output); +} +// clang-format on +// *INDENT-ON* + +TEST_SUITE_END() // LSTMLayerQuantized +TEST_SUITE_END() // NEON +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/CL/WidthConcatenateLayer.cpp b/tests/validation/CL/WidthConcatenateLayer.cpp index 52a4e4ccd6..7b894a63e0 100644 --- a/tests/validation/CL/WidthConcatenateLayer.cpp +++ b/tests/validation/CL/WidthConcatenateLayer.cpp @@ -98,9 +98,12 @@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - - concat_layer.configure({ &src1, &src2, &src3 }, &dst, 0); + CLConcatenateLayer concat_layer; + std::vector inputs; + inputs.emplace_back(&src1); + inputs.emplace_back(&src2); + inputs.emplace_back(&src3); + concat_layer.configure(inputs, &dst, 0); } template diff --git a/tests/validation/NEON/LSTMLayerQuantized.cpp b/tests/validation/NEON/LSTMLayerQuantized.cpp index 41c12c91e7..d5d036de33 100644 --- 
a/tests/validation/NEON/LSTMLayerQuantized.cpp
+++ b/tests/validation/NEON/LSTMLayerQuantized.cpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
 #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
+
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/Utils.h"
@@ -131,8 +131,6 @@ TEST_CASE(IntegrationTestCaseSmall, framework::DatasetMode::PRECOMMIT)
     output_gate_bias.allocator()->allocate();
     cell_state.allocator()->allocate();
     output_state.allocator()->allocate();
-    cell_state.allocator()->allocate();
-    output_state.allocator()->allocate();

     // Fill weights and biases
     fill_tensor(input_to_input_weights, std::vector<uint8_t>{ 47, 168,
@@ -452,7 +450,7 @@ TEST_CASE(IntegrationTestCaseLarge, framework::DatasetMode::PRECOMMIT)
 // *INDENT-ON*

 TEST_SUITE_END() // LSTMLayerQuantized
-TEST_SUITE_END() // CL
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/DequantizationLayerFixture.h b/tests/validation/fixtures/DequantizationLayerFixture.h
index 15f3711189..2c8f05746d 100644
--- a/tests/validation/fixtures/DequantizationLayerFixture.h
+++ b/tests/validation/fixtures/DequantizationLayerFixture.h
@@ -92,32 +92,46 @@ protected:
     SimpleTensor<T> compute_reference(const TensorShape &shape, DataType src_data_type)
     {
-        if(is_data_type_quantized_asymmetric(src_data_type))
+        if(src_data_type == DataType::QASYMM8)
         {
             SimpleTensor<uint8_t> src{ shape, src_data_type, 1, _quantization_info };
             fill(src);
             return reference::dequantization_layer<T>(src);
         }
-        else
+        else if(src_data_type == DataType::QSYMM8)
         {
             SimpleTensor<int8_t> src{ shape, src_data_type, 1, _quantization_info };
             fill(src);
             return reference::dequantization_layer<T>(src);
         }
+        else if(src_data_type == DataType::QSYMM16)
+        {
+            SimpleTensor<int16_t> src{ shape, src_data_type, 1, _quantization_info };
+            fill(src);
+            return reference::dequantization_layer<T>(src);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Unsupported data type");
+        }
     }

 protected:
     QuantizationInfo generate_quantization_info(DataType data_type)
     {
-        std::uniform_int_distribution<> distribution(1, 127);
         std::mt19937 gen(library.get()->seed());
+        std::uniform_int_distribution<> distribution_scale_q8(1, 255);
+        std::uniform_int_distribution<> distribution_offset_q8(1, 127);
+        std::uniform_int_distribution<> distribution_scale_q16(1, 32768);

         switch(data_type)
         {
+            case DataType::QSYMM16:
+                return QuantizationInfo(1.f / distribution_scale_q16(gen));
             case DataType::QSYMM8:
-                return QuantizationInfo(1.f / distribution(gen));
+                return QuantizationInfo(1.f / distribution_scale_q8(gen));
             case DataType::QASYMM8:
-                return QuantizationInfo(1.f / distribution(gen), distribution(gen));
+                return QuantizationInfo(1.f / distribution_scale_q8(gen), distribution_offset_q8(gen));
             default:
                 ARM_COMPUTE_ERROR("Unsupported data type");
         }
diff --git a/tests/validation/reference/DequantizationLayer.cpp b/tests/validation/reference/DequantizationLayer.cpp
index d07371c883..cceee0421c 100644
--- a/tests/validation/reference/DequantizationLayer.cpp
+++ b/tests/validation/reference/DequantizationLayer.cpp
@@ -45,6 +45,11 @@ TOut dequantize(uint8_t val, const UniformQuantizationInfo qinfo)
 {
     return static_cast<TOut>(dequantize_qasymm8(val, qinfo));
 }
+template <typename TOut>
+TOut dequantize(int16_t val, const UniformQuantizationInfo qinfo)
+{
+    return static_cast<TOut>(dequantize_qsymm16(val, qinfo));
+}

 template <typename TOut, typename TIn>
 SimpleTensor<TOut> dequantization_layer_nchw(const SimpleTensor<TIn> &src)
@@ -72,7 +77,7 @@ SimpleTensor<TOut> dequantization_layer_nchw(const SimpleTensor<TIn> &src)
             // Dequantize slice
             for(int s = 0; s < WH; ++s)
             {
-                dst[idx + s] = dequantize<TOut>(src[idx + s], channel_qinfo);
+                dst[idx + s] = dequantize<TOut>(static_cast<TIn>(src[idx + s]), channel_qinfo);
             }
         }
     }
@@ -84,7 +89,7 @@ SimpleTensor<TOut> dequantization_layer_nchw(const SimpleTensor<TIn> &src)
         for(int i = 0; i < src.num_elements(); ++i)
         {
-            dst[i] = static_cast<TOut>(dequantize<TOut>(src[i], quantization_info));
+            dst[i] = static_cast<TOut>(dequantize<TOut>(static_cast<TIn>(src[i]), quantization_info));
         }
     }
@@ -109,6 +114,8 @@ template SimpleTensor<half> dequantization_layer(const SimpleTensor<uint8_t> &sr
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<uint8_t> &src);
 template SimpleTensor<half> dequantization_layer(const SimpleTensor<int8_t> &src);
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<int8_t> &src);
+template SimpleTensor<half> dequantization_layer(const SimpleTensor<int16_t> &src);
+template SimpleTensor<float> dequantization_layer(const SimpleTensor<int16_t> &src);
 } // namespace reference
 } // namespace validation
 } // namespace test
-- cgit v1.2.1
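
A note on the fixed-point format exercised by the tests above: the LSTM cell state is kept in QSYMM16, a 16-bit symmetric scheme with no zero-point offset, so a stored int16_t maps back to a real value as real = q * scale. With the scale used in the tests (qsymm_4 = 16.f / 32768.f) the cell state covers roughly [-16, 16) in steps of about 4.9e-4. The standalone sketch below illustrates that mapping; it is not part of the patch, only the qsymm_4 scale is taken from the tests, and the local quantize_q16/dequantize_q16 lambdas are stand-ins that mirror the library helpers in behaviour rather than the library implementation itself.

// Standalone illustration of the QSYMM16 mapping used for the LSTM cell state above.
// Assumption: only the qsymm_4 scale (16.f / 32768.f) is taken from the tests; the
// lambdas below are local sketches, not the Compute Library API.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main()
{
    const float cell_scale = 16.f / 32768.f; // qsymm_4: ~[-16, 16) in ~4.9e-4 steps

    // Symmetric quantization has no offset: real = q * scale.
    auto dequantize_q16 = [](int16_t q, float scale) { return q * scale; };
    auto quantize_q16   = [](float v, float scale)
    {
        int32_t q = static_cast<int32_t>(std::lround(v / scale));
        q         = std::max<int32_t>(-32768, std::min<int32_t>(32767, q)); // saturate to int16_t
        return static_cast<int16_t>(q);
    };

    const int16_t q = quantize_q16(1.5f, cell_scale);
    std::cout << "q = " << q                                                // 3072
              << ", back to float = " << dequantize_q16(q, cell_scale)     // 1.5
              << "\n";
    return 0;
}

The same value * scale relation is what the reference dequantize_qsymm16 path added in this patch computes, which is why the validation above can compare the QSYMM16 cell state against a float reference directly.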