author    Manuel Bottini <manuel.bottini@arm.com>  2021-06-17 17:18:45 +0100
committer Manuel Bottini <manuel.bottini@arm.com>  2021-06-22 17:03:54 +0000
commit    ae58bdf3b58739e105a24e3640d0245e81cea5ee
tree      e993b8768c3eff364a7c706db411c799fa86bfe0
parent    2db3a9955ef22be4be8ccd5a45bc0973ef80e42a
Port NEGEMMLowp Part 1
Details:
Port NEGEMMLowpQuantizeDownInt32ScaleKernel to CpuGemmLowpQuantizeDownInt32ScaleKernel
Port NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
Port NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
Port NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
Port NEGEMMLowpOutputStage functions to CpuGemmLowpOutputStage operators

Partially Resolves: COMPMID-4403

Change-Id: I6d5f45e43f35d731d564ed3b5c0e804d2a318fb1
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5833
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
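For downstream users, the four removed per-type functions are subsumed by the single NEGEMMLowpOutputStage function, driven by a GEMMLowpOutputStageInfo descriptor. A minimal usage sketch, assuming the descriptor fields from the library's Types.h; the numeric values are purely illustrative:

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"

    using namespace arm_compute;

    // Requantize S32 GEMM accumulators to QASYMM8 via the consolidated output stage.
    // Tensor allocation is omitted; acc_s32/bias_s32/dst_qasymm8 are assumed configured.
    void requantize_sketch(ITensor *acc_s32, ITensor *bias_s32, ITensor *dst_qasymm8)
    {
        GEMMLowpOutputStageInfo info{};
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_multiplier = 1073741824; // result_fixedpoint_multiplier (illustrative)
        info.gemmlowp_shift      = 1;          // result_shift (illustrative)
        info.gemmlowp_offset     = 10;         // result_offset_after_shift (illustrative)
        info.gemmlowp_min_bound  = 0;          // optional clamp, e.g. for a fused ReLU
        info.gemmlowp_max_bound  = 255;
        info.output_data_type    = DataType::QASYMM8;

        NEGEMMLowpOutputStage output_stage;
        output_stage.configure(acc_s32, bias_s32, dst_qasymm8, info);
        output_stage.run();
    }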
Diffstat (limited to 'arm_compute')
 arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h  |   2
 arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h |   2
 arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h  | 237
 arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h   |  50
 arm_compute/runtime/NEON/functions/NEQLSTMLayer.h           |   2
 5 files changed, 40 insertions(+), 253 deletions(-)
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 9727e108a5..d2cd60e576 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -80,7 +80,7 @@ private:
* -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
* -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
* -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
- * -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is not equal to nullptr)
+ * -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index e89eae1d31..edb58e956a 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -155,7 +155,7 @@ private:
* -# @ref NEIm2ColKernel
* -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32)
* -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref NEGEMMLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED)
* -# @ref NEArithmeticAddition (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
* -# @ref NECol2ImKernel (if NCHW data layout)
*
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index fa5f5e3826..232344e5c2 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -25,7 +25,7 @@
#define ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
/** This file contains all available output stages for GEMMLowp.
*
@@ -39,237 +39,17 @@ namespace arm_compute
{
class ITensor;
class ITensorInfo;
-
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint.
- *
- * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint();
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint.
- *
- * NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint();
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint.
- *
- * NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
- *
- * result_fixedpoint_multiplier, result_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift)
- *
- * This function calls the following kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete;
- /** Default destructor */
- ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint();
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint
- *
- * @param[in] input Input tensor info. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
/** Basic function to execute GEMMLowpQuantizeDown kernels.
*
- * This function calls the following kernels:
+ * This function calls the following operators:
*
- * -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ * -# @ref cpu::CpuGemmLowpOutputStage
*/
-class NEGEMMLowpOutputStage : public INESimpleFunctionNoBorder
+class NEGEMMLowpOutputStage : public IFunction
{
public:
/** Constructor */
- NEGEMMLowpOutputStage() = default;
+ NEGEMMLowpOutputStage();
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMLowpOutputStage(const NEGEMMLowpOutputStage &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -310,6 +90,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H */
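The fixed-point requantization documented in the removed comments above is compact enough to restate in scalar form. A minimal self-contained sketch, assuming the FixedPointMul and clamp semantics described there; the names are illustrative, not library API:

    #include <algorithm>
    #include <cstdint>

    // Nearest integer to (x * y) / 2^31, computed in 64 bits so the product
    // cannot overflow (the INT32_MIN * INT32_MIN corner case is ignored here).
    int32_t fixed_point_mul(int32_t x, int32_t y)
    {
        const int64_t prod  = static_cast<int64_t>(x) * static_cast<int64_t>(y);
        const int64_t nudge = prod >= 0 ? (int64_t(1) << 30) : 1 - (int64_t(1) << 30);
        return static_cast<int32_t>((prod + nudge) / (int64_t(1) << 31));
    }

    // One element of the QASYMM8 output stage described above:
    // (FixedPointMul(acc, multiplier) >> shift) + offset, clamped to [min, max].
    // The production kernels use a rounding right shift; a plain shift keeps
    // this sketch aligned with the documented formula. Assumes shift >= 0.
    uint8_t quantize_down_u8(int32_t acc, int32_t multiplier, int shift, int32_t offset,
                             int32_t min = 0, int32_t max = 255)
    {
        const int32_t scaled = (fixed_point_mul(acc, multiplier) >> shift) + offset;
        return static_cast<uint8_t>(std::min(max, std::max(min, scaled)));
    }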
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index 2f0c753691..bcb89d997d 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -50,7 +50,7 @@ class ITensor;
* This function calls the following functions/kernels:
*
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
+ * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
* -# @ref NETranspose Matrix transpose
* -# @ref NEConcatenateLayer Tensor concatenation
* -# @ref NEActivationLayer Activation functions (tanh and logistic)
@@ -147,30 +147,30 @@ private:
MemoryGroup _memory_group;
// Functions used
- NEGEMMLowpMatrixMultiplyCore _gemmlowp;
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
- NETranspose _transpose_weights;
- NEConcatenateLayer _concat_input_weights;
- NEConcatenateLayer _concat_recurrent_weights;
- NEConcatenateLayer _concat_weights;
- NEConcatenateLayer _concat_inputs;
- NEConcatenateLayer _concat_bias;
- NEActivationLayer _sigmoid_forget_gate;
- NEActivationLayer _sigmoid_input_gate;
- NEActivationLayer _sigmoid_output_gate;
- NEActivationLayer _tanh_modulation_gate;
- NEActivationLayer _tanh_output_state;
- NEArithmeticAddition _add1;
- NEArithmeticAddition _add2;
- NEPixelWiseMultiplication _mul1;
- NEPixelWiseMultiplication _mul2;
- NEPixelWiseMultiplication _mul3;
- NESlice _slice_input_tensor;
- NESlice _slice_forget_tensor;
- NESlice _slice_cell_tensor;
- NESlice _slice_output_tensor;
- NEDequantizationLayer _dequantize;
- NEQuantizationLayer _quantize;
+ NEGEMMLowpMatrixMultiplyCore _gemmlowp;
+ NEGEMMLowpOutputStage _output_stage;
+ NETranspose _transpose_weights;
+ NEConcatenateLayer _concat_input_weights;
+ NEConcatenateLayer _concat_recurrent_weights;
+ NEConcatenateLayer _concat_weights;
+ NEConcatenateLayer _concat_inputs;
+ NEConcatenateLayer _concat_bias;
+ NEActivationLayer _sigmoid_forget_gate;
+ NEActivationLayer _sigmoid_input_gate;
+ NEActivationLayer _sigmoid_output_gate;
+ NEActivationLayer _tanh_modulation_gate;
+ NEActivationLayer _tanh_output_state;
+ NEArithmeticAddition _add1;
+ NEArithmeticAddition _add2;
+ NEPixelWiseMultiplication _mul1;
+ NEPixelWiseMultiplication _mul2;
+ NEPixelWiseMultiplication _mul3;
+ NESlice _slice_input_tensor;
+ NESlice _slice_forget_tensor;
+ NESlice _slice_cell_tensor;
+ NESlice _slice_output_tensor;
+ NEDequantizationLayer _dequantize;
+ NEQuantizationLayer _quantize;
// Tensor pointers
const ITensor *_input_to_input_weights;
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 7c2e9bc5a1..77adffd543 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -54,7 +54,7 @@ class NEGEMMLowpMatrixAReductionKernel;
* -# @ref NEArithmeticSubtraction Elementwise subtraction
* -# @ref NECopy Copy kernel for copying output_state_out to output
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
+ * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
* -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use
* -# @ref NEPixelWiseMultiplication Elementwise multiplication
* -# @ref NETranspose Transpose function for reshaping the weights