From 10c53f1ef317095ddcd9143bf759cc68ecb0e721 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Wed, 17 Jul 2019 16:11:53 +0100 Subject: COMPMID-2307: QUANTIZED_16BIT_LSTM operator for CL Change-Id: I1b52df359f1a368d585fac43a08496544dd2f86f Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/1568 Tested-by: Arm Jenkins Reviewed-by: Giuseppe Rossini Comments-Addressed: Arm Jenkins --- .../core/CL/kernels/CLDequantizationLayerKernel.h | 4 +- arm_compute/core/CL/kernels/CLStridedSliceKernel.h | 6 +- arm_compute/core/QuantizationInfo.h | 12 + arm_compute/runtime/CL/CLFunctions.h | 1 + .../runtime/CL/functions/CLConcatenateLayer.h | 10 +- .../runtime/CL/functions/CLDequantizationLayer.h | 4 +- .../runtime/CL/functions/CLLSTMLayerQuantized.h | 203 +++++++++ arm_compute/runtime/CL/functions/CLStridedSlice.h | 6 +- .../runtime/NEON/functions/NELSTMLayerQuantized.h | 2 +- .../CL/kernels/CLDequantizationLayerKernel.cpp | 7 +- src/core/CL/kernels/CLStridedSliceKernel.cpp | 2 +- .../kernels/CLWidthConcatenate4TensorsKernel.cpp | 2 +- .../NEON/kernels/NEDequantizationLayerKernel.cpp | 2 +- src/runtime/CL/functions/CLConcatenateLayer.cpp | 28 +- src/runtime/CL/functions/CLLSTMLayerQuantized.cpp | 397 ++++++++++++++++++ .../NEON/functions/NELSTMLayerQuantized.cpp | 8 +- tests/datasets/DatatypeDataset.h | 1 + tests/validation/CL/BatchConcatenateLayer.cpp | 9 +- tests/validation/CL/DepthConcatenateLayer.cpp | 9 +- tests/validation/CL/LSTMLayerQuantized.cpp | 458 +++++++++++++++++++++ tests/validation/CL/WidthConcatenateLayer.cpp | 9 +- tests/validation/NEON/LSTMLayerQuantized.cpp | 6 +- .../fixtures/DequantizationLayerFixture.h | 24 +- tests/validation/reference/DequantizationLayer.cpp | 11 +- 24 files changed, 1176 insertions(+), 45 deletions(-) create mode 100644 arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h create mode 100644 src/runtime/CL/functions/CLLSTMLayerQuantized.cpp create mode 100644 tests/validation/CL/LSTMLayerQuantized.cpp diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h index 6d37f6a1a5..0ee5a13638 100644 --- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h @@ -48,13 +48,13 @@ public: ~CLDequantizationLayerKernel() = default; /** Set the input, output, min and max. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QSYMM8. + * @param[in] input Source tensor. Data types supported: QASYMM8/QSYMM8/QSYMM16. * @param[out] output Destination tensor. Data types supported: F16/F32. */ void configure(const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8. + * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16. * @param[in] output Output tensor info. Data types supported: F16/F32. * * @return a status diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h index e104dcfdd7..5b69b3fd16 100644 --- a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h +++ b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,7 @@ public:
      *
      * @note Supported tensor rank: up to 4
      *
-     * @param[in]  input   Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+     * @param[in]  input   Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/QSYMM16/U32/S32/F16/F32
      * @param[out] output  Destination tensor. Data type supported: Same as @p input
      * @param[in]  starts  The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  ends    The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
@@ -72,7 +72,7 @@ public:
      *
      * @note Supported tensor rank: up to 4
      *
-     * @param[in]  input   Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+     * @param[in]  input   Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/QSYMM16/U32/S32/F16/F32
      * @param[in]  output  Destination tensor. Data type supported: Same as @p input
      * @param[in]  starts  The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  ends    The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 587a380d63..79afca0714 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -300,6 +300,18 @@ inline float dequantize(int8_t value, float scale)
     return value * scale;
 }
 
+/** Dequantize a value given a symmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] scale Scale to use for dequantization
+ *
+ * @return Dequantized value
+ */
+inline float dequantize(int16_t value, float scale)
+{
+    return value * scale;
+}
+
 /** Quantize a value given a 16-bit symmetric quantization scheme
  *
  * @param[in] value Value to quantize
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 8c154f2059..922fb6acd9 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -94,6 +94,7 @@
 #include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
 #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"
 #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
 #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
 #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
 #include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index b69930c7d3..fb9724d167 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -60,7 +60,8 @@ public:
      * @param[out] output        Output tensor. Data types supported: Same as @p input.
      * @param[in]  axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      */
-    void configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
+    void configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
+    void configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
     /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
      *
      * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
@@ -73,11 +74,18 @@ public:
      *
      * @return a status
      */
     static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+    static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
+    template <typename TensorType>
+    void configure_internal(std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis);
+
+    template <typename TensorInfoType>
+    static Status validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis);
+
     std::vector<std::unique_ptr<ICLKernel>> _concat_kernels;
     unsigned int                            _num_inputs;
     unsigned int                            _axis;
diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
index 2f7af01a84..ade589d79e 100644
--- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
@@ -40,13 +40,13 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.
-     *                    Data types supported: QASYMM8/QSYMM8.
+     *                    Data types supported: QASYMM8/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayer
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data type supported: F16/F32.
      *
      * @return a status
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
new file mode 100644
index 0000000000..e2d164c395
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_CLLSTMLAYERQUANTIZED_H__ +#define __ARM_COMPUTE_CLLSTMLAYERQUANTIZED_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" +#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" +#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLSlice.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" + +#include "arm_compute/runtime/common/LSTMParams.h" + +namespace arm_compute +{ +// Forward declarations +class ICLTensor; + +/** Basic function to run @ref CLLSTMLayerQuantized + * + * This function calls the following CL functions/kernels: + * + * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers + * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 + * -# @ref CLTranspose Matrix transpose + * -# @ref CLConcatenateLayer Tensor concatenation + * -# @ref CLActivationLayer Activation functions (tanh and logistic) + * -# @ref CLArithmeticAddition Elementwise addition + * -# @ref CLPixelWiseMultiplication Elementwise multiplication + * -# @ref CLSlice Tensor slicing + * -# @ref CLDequantizationLayer Dequantize into float + * -# @ref CLQuantizationLayer Quantize from float + * */ +class CLLSTMLayerQuantized : public IFunction +{ +public: + /** Default constructor */ + CLLSTMLayerQuantized(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLSTMLayerQuantized(const CLLSTMLayerQuantized &) = delete; + /** Default move constructor */ + CLLSTMLayerQuantized(CLLSTMLayerQuantized &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLSTMLayerQuantized &operator=(const CLLSTMLayerQuantized &) = delete; + /** Default move assignment operator */ + CLLSTMLayerQuantized &operator=(CLLSTMLayerQuantized &&) = default; + /** Initialize function's tensors. + * + * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8. + * @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_output_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_input_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_forget_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_cell_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. 
+ * @param[in] recurrent_to_output_weights 2D weights tensor with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] forget_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] output_gate_bias 1D weights tensor with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_state_in 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[in] output_state_in 2D tensor with dimensions [output_size, batch_size]. Data type supported: Same as @p input. + * @param[out] cell_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input. + */ + void configure(const ICLTensor *input, + const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out); + + /** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayerQuantized + * + * @param[in] input Source tensor info. Input is a 2D tensor info with dimensions [input_size, batch_size]. Data types supported: QASYMM8. + * @param[in] input_to_input_weights 2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_forget_weights 2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_cell_weights 2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_to_output_weights 2D weights tensor info with dimensions [input_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_input_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_forget_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_cell_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] recurrent_to_output_weights 2D weights tensor info with dimensions [output_size, output_size]. Data type supported: Same as @p input. + * @param[in] input_gate_bias 1D weights tensor info with dimensions [output_size]. Data type supported: S32. + * @param[in] forget_gate_bias 1D weights tensor info with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_bias 1D weights tensor info with dimensions [output_size]. Data type supported: S32. 
+ * @param[in] output_gate_bias 1D weights tensor info with dimensions [output_size]. Data type supported: S32. + * @param[in] cell_state_in 2D tensor info with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[in] output_state_in 2D tensor info with dimensions [output_size, batch_size]. Data type supported: Same as @p input. + * @param[out] cell_state_out Destination tensor info. Output is a 2D tensor info with dimensions [output_size, batch_size]. Data type supported: QSYMM16. + * @param[out] output_state_out Destination tensor info. Output is a 2D tensor info with dimensions [output_size, batch_size].Data types supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + CLMemoryGroup _memory_group; + + // Functions used + CLGEMMLowpMatrixMultiplyCore _gemmlowp; + CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage; + CLTranspose _transpose_weights; + CLConcatenateLayer _concat_input_weights; + CLConcatenateLayer _concat_recurrent_weights; + CLConcatenateLayer _concat_weights; + CLConcatenateLayer _concat_inputs; + CLConcatenateLayer _concat_bias; + CLActivationLayer _sigmoid_forget_gate; + CLActivationLayer _sigmoid_input_gate; + CLActivationLayer _sigmoid_output_gate; + CLActivationLayer _tanh_modulation_gate; + CLActivationLayer _tanh_output_state; + CLArithmeticAddition _add_cell_state_tmps; + CLArithmeticAddition _add2; + CLPixelWiseMultiplication _mul_forget_gate_cell_state; + CLPixelWiseMultiplication _mul_input_gate_input_mod_gate; + CLPixelWiseMultiplication _mul_output_state_tmp_output_gate; + CLSlice _slice_input_tensor; + CLSlice _slice_forget_tensor; + CLSlice _slice_cell_tensor; + CLSlice _slice_output_tensor; + CLDequantizationLayer _dequantize; + CLQuantizationLayer _quantize; + + // Tensor pointers + const ICLTensor *_input_to_input_weights; + const ICLTensor *_input_to_forget_weights; + const ICLTensor *_input_to_cell_weights; + const ICLTensor *_input_to_output_weights; + const ICLTensor *_recurrent_to_input_weights; + const ICLTensor *_recurrent_to_forget_weights; + const ICLTensor *_recurrent_to_cell_weights; + const ICLTensor *_recurrent_to_output_weights; + const ICLTensor *_input_gate_bias; + const ICLTensor *_forget_gate_bias; + const ICLTensor *_cell_bias; + const ICLTensor *_output_gate_bias; + + // Temporary tensors + CLTensor _recurrent_weights; + CLTensor _input_weights; + CLTensor _weights; + CLTensor _input; + CLTensor _weights_transposed; + CLTensor _output_highp; + CLTensor _output_lowp; + CLTensor _bias; + CLTensor _forget_gate_input; + CLTensor _input_gate_input; + CLTensor _output_gate_input; + CLTensor _input_modulation_gate_input; + CLTensor _forget_gate_output; + CLTensor _input_gate_output; + CLTensor 
_output_gate_output; + CLTensor _input_modulation_gate_output; + CLTensor _cell_state_tmp1; + CLTensor _cell_state_tmp2; + CLTensor _output_state_tmp; + CLTensor _output_state_out_symm; + CLTensor _output_state_out_f32; + + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLLSTMLAYERQUANTIZED_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h index 4a336f6fdc..bb97b17fea 100644 --- a/arm_compute/runtime/CL/functions/CLStridedSlice.h +++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,7 +39,7 @@ public: * * @note Supported tensor rank: up to 4 * - * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/QSYMM16/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). @@ -57,7 +57,7 @@ public: * * @note Supported tensor rank: up to 4 * - * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/QSYMM16/U32/S32/F16/F32 * @param[in] output Destination tensor. Data type supported: Same as @p input * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). 
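For orientation, a minimal sketch of how the new CLLSTMLayerQuantized function is intended to be driven, based on the interface documented above. The sizes, tensor names and quantization parameters are illustrative assumptions; the integration tests added later in this patch exercise the same call sequence with concrete data.

// Sketch only: assumed shapes/quantization, not part of the patch itself.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"

using namespace arm_compute;

void run_one_lstm_step()
{
    CLScheduler::get().default_init();

    const int input_size = 2, output_size = 4, batch_size = 2;  // assumed sizes
    const QuantizationInfo qasymm(1.f / 128.f, 128);             // assumed input/output state quantization
    const QuantizationInfo qweights(1.f / 128.f, 128);           // assumed weights quantization
    const QuantizationInfo qsymm_4(16.f / 32768.f, 0);           // cell state: QSYMM16 with 4 integer bits

    // Stateful tensors: QASYMM8 input and output state, QSYMM16 cell state
    CLTensor input, output_state, cell_state;
    input.allocator()->init(TensorInfo(TensorShape(input_size, batch_size), 1, DataType::QASYMM8, qasymm));
    output_state.allocator()->init(TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QASYMM8, qasymm));
    cell_state.allocator()->init(TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_4));

    // Four input-to-gate weights, four recurrent-to-gate weights (QASYMM8) and four biases (S32)
    CLTensor i2i_w, i2f_w, i2c_w, i2o_w, r2i_w, r2f_w, r2c_w, r2o_w, i_bias, f_bias, c_bias, o_bias;
    for(CLTensor *w : { &i2i_w, &i2f_w, &i2c_w, &i2o_w })
        w->allocator()->init(TensorInfo(TensorShape(input_size, output_size), 1, DataType::QASYMM8, qweights));
    for(CLTensor *w : { &r2i_w, &r2f_w, &r2c_w, &r2o_w })
        w->allocator()->init(TensorInfo(TensorShape(output_size, output_size), 1, DataType::QASYMM8, qweights));
    for(CLTensor *b : { &i_bias, &f_bias, &c_bias, &o_bias })
        b->allocator()->init(TensorInfo(TensorShape(output_size), 1, DataType::S32));

    // The same state tensors are passed as "in" and "out", so each run() advances the state by one step
    CLLSTMLayerQuantized lstm;
    lstm.configure(&input, &i2i_w, &i2f_w, &i2c_w, &i2o_w, &r2i_w, &r2f_w, &r2c_w, &r2o_w,
                   &i_bias, &f_bias, &c_bias, &o_bias, &cell_state, &output_state, &cell_state, &output_state);

    // Allocate backing memory, then fill weights, biases, state and input (e.g. by mapping the tensors)
    for(CLTensor *t : { &input, &output_state, &cell_state, &i2i_w, &i2f_w, &i2c_w, &i2o_w,
                        &r2i_w, &r2f_w, &r2c_w, &r2o_w, &i_bias, &f_bias, &c_bias, &o_bias })
        t->allocator()->allocate();

    lstm.run(); // output_state and cell_state now hold the state after one time step
}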
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h index b45d714990..7f02988c19 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h @@ -53,7 +53,7 @@ class ITensor; * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 * -# @ref NETranspose Matrix transpose * -# @ref NEConcatenateLayer Tensor concatenation - * -# @ref NEActivationLayer Activation functions (tanh and logistig) + * -# @ref NEActivationLayer Activation functions (tanh and logistic) * -# @ref NEArithmeticAddition Elementwise addition * -# @ref NEPixelWiseMultiplication Elementwise multiplication * -# @ref NESlice Tensor slicing diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp index e383bc475d..12d36cdb9f 100644 --- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp @@ -33,14 +33,14 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -using namespace arm_compute; - +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QSYMM16); if(output->tensor_shape().total_size() > 0) { @@ -135,3 +135,4 @@ void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &qu } while(window_collapsed.slide_window_slice_3D(slice)); } +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp index c2bdf7f299..9dd488b678 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -48,7 +48,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, + DataType::U16, DataType::S16, DataType::QSYMM16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp index a3ac102564..4e673a9f38 100644 --- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp @@ -84,7 +84,7 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, - DataType::F32); + DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output); ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0)); diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp 
b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp index d11f04a82f..e52f53ea04 100644 --- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp @@ -194,7 +194,7 @@ void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Wind // Compute left-over elements for(; x < window_end_x; ++x) { - uint8_t val = *(in_ptr + x); + int8_t val = *(in_ptr + x); *(out_ptr + x) = static_cast(dequantize(val, scale)); } }, diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 1d396f5ebf..5d224db8e9 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -47,14 +47,35 @@ CLConcatenateLayer::CLConcatenateLayer() { } -void CLConcatenateLayer::configure(const std::vector &inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) +{ + configure_internal(std::move(inputs_vector), output, axis); +} + +void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) +{ + configure_internal(std::move(inputs_vector), output, axis); +} + +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +{ + return validate_internal(inputs_vector, output, axis); +} + +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +{ + return validate_internal(inputs_vector, output, axis); +} + +template +void CLConcatenateLayer::configure_internal(std::vector &&inputs_vector, ICLTensor *output, size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); _axis = axis; _num_inputs = inputs_vector.size(); std::vector inputs_vector_info(inputs_vector.size()); - std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](ICLTensor * t) + std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](TensorType * t) { ARM_COMPUTE_ERROR_ON_NULLPTR(t); return t->info(); @@ -141,7 +162,8 @@ void CLConcatenateLayer::configure(const std::vector &inputs_vector } } -Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +template +Status CLConcatenateLayer::validate_internal(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) { ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr); const unsigned int num_inputs = inputs_vector.size(); diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp new file mode 100644 index 0000000000..e0006a77d0 --- /dev/null +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace +{ +// Quantization info structures used in the LSTMQuantize layer +const QuantizationInfo qasymm(1.f / 128.f, 128); +const QuantizationInfo qsymm_3(8.f / 32768.f, 0); // qsymm16 with 3 integer bit +const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bit +const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit +} // namespace + +CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), + _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), + _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), + _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), + _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), + _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) +{ +} + +void CLLSTMLayerQuantized::configure(const ICLTensor *input, + const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, 
recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), + recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + const int input_size = input->info()->dimension(0); + const int batch_size = input->info()->dimension(1); + const int output_size = input_to_input_weights->info()->dimension(1); + + const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization + + auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + + _input_to_input_weights = input_to_input_weights; + _input_to_forget_weights = input_to_forget_weights; + _input_to_cell_weights = input_to_cell_weights; + _input_to_output_weights = input_to_output_weights; + _recurrent_to_input_weights = recurrent_to_input_weights; + _recurrent_to_forget_weights = recurrent_to_forget_weights; + _recurrent_to_cell_weights = recurrent_to_cell_weights; + _recurrent_to_output_weights = recurrent_to_output_weights; + _input_gate_bias = input_gate_bias; + _forget_gate_bias = forget_gate_bias; + _cell_bias = cell_bias; + _output_gate_bias = output_gate_bias; + + // Weights concatenation + std::vector inputs_weights_vector; + inputs_weights_vector.emplace_back(input_to_input_weights); + inputs_weights_vector.emplace_back(input_to_forget_weights); + inputs_weights_vector.emplace_back(input_to_cell_weights); + inputs_weights_vector.emplace_back(input_to_output_weights); + + std::vector recurrent_weights_vector; + recurrent_weights_vector.emplace_back(recurrent_to_input_weights); + recurrent_weights_vector.emplace_back(recurrent_to_forget_weights); + recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); + recurrent_weights_vector.emplace_back(recurrent_to_output_weights); + + _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); + + _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); + + std::vector weights_vector; + weights_vector.emplace_back(&_recurrent_weights); + weights_vector.emplace_back(&_input_weights); + + _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _concat_weights.configure(weights_vector, &_weights, Window::DimX); + _transpose_weights.configure(&_weights, &_weights_transposed); + + // Input concatenation + std::vector input_vector; + input_vector.emplace_back(input); + input_vector.emplace_back(output_state_in); + + _memory_group.manage(&_input); + 
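    // Implementation outline: the four input-to-gate and four recurrent-to-gate weight matrices are
    // concatenated into a single [input_size + output_size, 4 * output_size] matrix, and the input is
    // concatenated with the previous output state into a single [input_size + output_size, batch_size]
    // tensor, so that all four gate pre-activations are produced by one low-precision GEMM with S32
    // accumulators. The accumulators are then requantized to QSYMM16 with 3 integer bits (scale
    // 8 / 32768 = 2^-12), which is why the output stage multiplier below is 4096 * input_scale * weights_scale,
    // and the result is finally sliced into the forget, input, input-modulation and output gate inputs.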
_input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _concat_inputs.configure(input_vector, &_input, Window::DimX); + + // Bias concatenation + std::vector bias_vector; + bias_vector.emplace_back(input_gate_bias); + bias_vector.emplace_back(forget_gate_bias); + bias_vector.emplace_back(cell_bias); + bias_vector.emplace_back(output_gate_bias); + + _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); + _concat_bias.configure(bias_vector, &_bias, Window::DimX); + + // Invert the offset for gemmlowp + _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); + _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + + // Run gemmlowp + _memory_group.manage(&_output_highp); + _output_highp.allocator()->init(TensorInfo(TensorShape(4 * output_size, batch_size), 1, DataType::S32)); + _gemmlowp.configure(&_input, &_weights_transposed, nullptr, &_output_highp); + _input.allocator()->allocate(); + + // Set the offset back + _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); + _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + + // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) + _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); + + const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; + int output_multiplier = 0; + int output_shift = 0; + + quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + + _memory_group.manage(&_output_lowp); + _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + _output_highp.allocator()->allocate(); + _bias.allocator()->allocate(); + + // Get the gate tensors + _memory_group.manage(&_input_gate_input); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _memory_group.manage(&_forget_gate_input); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _memory_group.manage(&_input_modulation_gate_input); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _memory_group.manage(&_output_gate_input); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _output_lowp.allocator()->allocate(); + + // Forget gate + _memory_group.manage(&_forget_gate_output); + _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_input.allocator()->allocate(); + + // Input gate + _memory_group.manage(&_input_gate_output); + _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_input.allocator()->allocate(); + + // 
Input modulation gate equation + _memory_group.manage(&_input_modulation_gate_output); + _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_input.allocator()->allocate(); + + // Output gate + _memory_group.manage(&_output_gate_output); + _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_input.allocator()->allocate(); + + // Long term memory + _memory_group.manage(&_cell_state_tmp1); + _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(&_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _forget_gate_output.allocator()->allocate(); + + _memory_group.manage(&_cell_state_tmp2); + _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_input_gate_input_mod_gate.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _input_modulation_gate_output.allocator()->allocate(); + _input_gate_output.allocator()->allocate(); + + _add_cell_state_tmps.configure(&_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _cell_state_tmp1.allocator()->allocate(); + _cell_state_tmp2.allocator()->allocate(); + + // Short term memory + _memory_group.manage(&_output_state_tmp); + _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + + _memory_group.manage(&_output_state_out_symm); + _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_gate_output.allocator()->allocate(); + _output_state_tmp.allocator()->allocate(); + + // Requantize the output state from QSYMM16 to QASYMM8 + _memory_group.manage(&_output_state_out_f32); + _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); + _output_state_out_symm.allocator()->allocate(); + + _quantize.configure(&_output_state_out_f32, output_state_out); + _output_state_out_f32.allocator()->allocate(); +} + +Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, + const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo 
*recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out); + + const int input_size = input->dimension(0); + const int batch_size = input->dimension(1); + const int output_size = input_to_input_weights->dimension(1); + + // Dimensionality checks + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(input_to_input_weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); + + TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); + TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + + // Shape checks + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); + + // Data type checks + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); + + // Quantization checks + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); + + if(cell_state_out->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); + } + + if(output_state_out->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_out); + } + + return Status{}; +} + +void CLLSTMLayerQuantized::run() +{ + prepare(); + + // Acquire all the temporaries + MemoryGroupResourceScope scope_mg(_memory_group); + + // Concat and transpose the input + _concat_inputs.run(); + + // Run gemmlowp + _gemmlowp.run(); + _output_stage.run(); + + // Slice the results + _slice_input_tensor.run(); + _slice_forget_tensor.run(); + _slice_cell_tensor.run(); + _slice_output_tensor.run(); + + // Gates + // Forget gate + _sigmoid_forget_gate.run(); + + // Input gate + _sigmoid_input_gate.run(); + + // Input modulation gate + _tanh_modulation_gate.run(); + + // Output gate + _sigmoid_output_gate.run(); + + // Cell state (long term memory) + _mul_forget_gate_cell_state.run(); + _mul_input_gate_input_mod_gate.run(); + _add_cell_state_tmps.run(); + + // Output state (short term memory) + _tanh_output_state.run(); + _mul_output_state_tmp_output_gate.run(); + + // Requantize output state from QSYMM16 to QASYMM16 + _dequantize.run(); + _quantize.run(); +} + +void CLLSTMLayerQuantized::prepare() +{ + if(!_is_prepared) + { + _input_weights.allocator()->allocate(); + _concat_input_weights.run(); + + _input_to_input_weights->mark_as_unused(); + _input_to_forget_weights->mark_as_unused(); + _input_to_cell_weights->mark_as_unused(); + _input_to_output_weights->mark_as_unused(); + + _recurrent_weights.allocator()->allocate(); + _concat_recurrent_weights.run(); + _recurrent_to_input_weights->mark_as_unused(); + _recurrent_to_forget_weights->mark_as_unused(); + _recurrent_to_cell_weights->mark_as_unused(); + _recurrent_to_output_weights->mark_as_unused(); + + _weights.allocator()->allocate(); + _concat_weights.run(); + + _input_weights.mark_as_unused(); + _input_weights.allocator()->free(); + _recurrent_weights.mark_as_unused(); + _recurrent_weights.allocator()->free(); + + _weights_transposed.allocator()->allocate(); + _transpose_weights.run(); + + _weights.mark_as_unused(); + _weights.allocator()->free(); + + _bias.allocator()->allocate(); + _concat_bias.run(); + _input_gate_bias->mark_as_unused(); + _forget_gate_bias->mark_as_unused(); + _cell_bias->mark_as_unused(); + _output_gate_bias->mark_as_unused(); + + _is_prepared = true; + } +} + +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index 05e05a5e57..6cfa9887ff 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -240,7 
+240,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); @@ -254,14 +254,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // Data type checks ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); diff --git a/tests/datasets/DatatypeDataset.h b/tests/datasets/DatatypeDataset.h index bb2774b4b3..a158a5f52d 100644 --- a/tests/datasets/DatatypeDataset.h +++ b/tests/datasets/DatatypeDataset.h @@ -43,6 +43,7 @@ public: { DataType::QSYMM8, DataType::QASYMM8, + DataType::QSYMM16, }) { } diff --git a/tests/validation/CL/BatchConcatenateLayer.cpp b/tests/validation/CL/BatchConcatenateLayer.cpp index b789569155..6c4ffee1dc 100644 --- a/tests/validation/CL/BatchConcatenateLayer.cpp +++ b/tests/validation/CL/BatchConcatenateLayer.cpp @@ -97,9 +97,12 
@@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - - concat_layer.configure({ &src1, &src2, &src3 }, &dst, 3); + CLConcatenateLayer concat_layer; + std::vector inputs; + inputs.emplace_back(&src1); + inputs.emplace_back(&src2); + inputs.emplace_back(&src3); + concat_layer.configure(inputs, &dst, 3); } template using CLBatchConcatenateLayerFixture = ConcatenateLayerValidationFixture; diff --git a/tests/validation/CL/DepthConcatenateLayer.cpp b/tests/validation/CL/DepthConcatenateLayer.cpp index 8cbfda382b..c67ed05ecd 100644 --- a/tests/validation/CL/DepthConcatenateLayer.cpp +++ b/tests/validation/CL/DepthConcatenateLayer.cpp @@ -94,9 +94,12 @@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - - concat_layer.configure({ &src1, &src2, &src3 }, &dst, 2); + CLConcatenateLayer concat_layer; + std::vector inputs; + inputs.emplace_back(&src1); + inputs.emplace_back(&src2); + inputs.emplace_back(&src3); + concat_layer.configure(inputs, &dst, 2); } template using CLDepthConcatenateLayerFixture = ConcatenateLayerValidationFixture; diff --git a/tests/validation/CL/LSTMLayerQuantized.cpp b/tests/validation/CL/LSTMLayerQuantized.cpp new file mode 100644 index 0000000000..1fc0af1ecb --- /dev/null +++ b/tests/validation/CL/LSTMLayerQuantized.cpp @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/Utils.h"
+#include "tests/datasets/LSTMLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+template <typename T>
+inline void fill_tensor(CLTensor &tensor, const std::vector<T> &v)
+{
+    tensor.map(true);
+    // Import memory accounting for padding
+    TensorShape t_shape = tensor.info()->tensor_shape();
+    Window window;
+    window.use_tensor_dimensions(t_shape);
+    Iterator out(&tensor, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        *reinterpret_cast<T *>(out.ptr()) = v[coord2index(t_shape, id)];
+    },
+    out);
+    tensor.unmap();
+}
+
+template <typename T>
+inline void fill_tensor(SimpleTensor<T> &tensor, const std::vector<T> &v)
+{
+    std::memcpy(tensor.data(), v.data(), sizeof(T) * v.size());
+}
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(LSTMLayerQuantized)
+
+// *INDENT-OFF*
+// clang-format off
+TEST_CASE(IntegrationTestCaseSmall, framework::DatasetMode::PRECOMMIT)
+{
+    const int batch_size  = 2;
+    const int input_size  = 2;
+    const int output_size = 4;
+
+
+    QuantizationInfo qasymm(1.f / 128.f, 128);
+    QuantizationInfo qweights(1.f / 128.f, 128);
+    QuantizationInfo qsymm_3(8.f / 32768.f, 0);
+    QuantizationInfo qsymm_4(16.f / 32768.f, 0);
+
+    TensorShape input_shape{ input_size, batch_size };
+    TensorShape input_weights_shape{ input_size, output_size };
+    TensorShape recurrent_weights_shape{ output_size, output_size };
+    TensorShape output_shape{ output_size, batch_size};
+    TensorShape bias_shape{ output_size };
+
+    auto input_to_input_weights      = create_tensor<CLTensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_forget_weights     = create_tensor<CLTensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_cell_weights       = create_tensor<CLTensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_to_output_weights     = create_tensor<CLTensor>(input_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_input_weights  = create_tensor<CLTensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_forget_weights = create_tensor<CLTensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_cell_weights   = create_tensor<CLTensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto recurrent_to_output_weights = create_tensor<CLTensor>(recurrent_weights_shape, DataType::QASYMM8, 1, qweights);
+    auto input_gate_bias             = create_tensor<CLTensor>(bias_shape, DataType::S32);
+    auto forget_gate_bias            = create_tensor<CLTensor>(bias_shape, DataType::S32);
+    auto cell_gate_bias              = create_tensor<CLTensor>(bias_shape, DataType::S32);
+    auto output_gate_bias            = create_tensor<CLTensor>(bias_shape, DataType::S32);
+
+    // LSTM input
+    auto input = create_tensor<CLTensor>(input_shape, DataType::QASYMM8, 1, qasymm);
+
+    // LSTM output state
+    auto output_state = create_tensor<CLTensor>(output_shape, DataType::QASYMM8, 1, qasymm);
+
+    // LSTM cell state
+    auto cell_state = create_tensor<CLTensor>(output_shape, DataType::QSYMM16, 1, qsymm_4);
+
+    CLLSTMLayerQuantized lstmq;
+
+    lstmq.configure(&input, &input_to_input_weights, &input_to_forget_weights, &input_to_cell_weights, &input_to_output_weights,
+                    &recurrent_to_input_weights, &recurrent_to_forget_weights, &recurrent_to_cell_weights, &recurrent_to_output_weights,
+                    &input_gate_bias, &forget_gate_bias, &cell_gate_bias,
&output_gate_bias, &cell_state, &output_state, &cell_state, &output_state); + + input.allocator()->allocate(); + input_to_input_weights.allocator()->allocate(); + input_to_forget_weights.allocator()->allocate(); + input_to_cell_weights.allocator()->allocate(); + input_to_output_weights.allocator()->allocate(); + recurrent_to_input_weights.allocator()->allocate(); + recurrent_to_forget_weights.allocator()->allocate(); + recurrent_to_cell_weights.allocator()->allocate(); + recurrent_to_output_weights.allocator()->allocate(); + input_gate_bias.allocator()->allocate(); + forget_gate_bias.allocator()->allocate(); + cell_gate_bias.allocator()->allocate(); + output_gate_bias.allocator()->allocate(); + cell_state.allocator()->allocate(); + output_state.allocator()->allocate(); + + // Fill weights and biases + fill_tensor(input_to_input_weights, std::vector{ 47, 168, + 66, 239, + 6, 42, + 237, 236 }); + + fill_tensor(input_to_forget_weights, std::vector { 204, 193, + 148, 59, + 113, 17, + 66, 197 }); + + fill_tensor(input_to_cell_weights, std::vector { 172, 101, + 184, 209, + 165, 82, + 108, 209 }); + + fill_tensor(input_to_output_weights, std::vector { 203, 244, + 219, 114, + 130, 16, + 163, 222 }); + + fill_tensor(recurrent_to_input_weights, std::vector { 162, 168, 7, 95, + 91, 155, 108, 216, + 255, 100, 48, 188, + 58, 37, 186, 147 }); + + fill_tensor(recurrent_to_forget_weights, std::vector { 46, 58, 47, 170, + 246, 96, 12, 99, + 68, 23, 186, 161, + 237, 164, 89, 6 }); + + fill_tensor(recurrent_to_cell_weights, std::vector { 234, 99, 71, 206, + 205, 159, 64, 253, + 191, 148, 116, 8, + 209, 136, 59, 138 }); + + fill_tensor(recurrent_to_output_weights, std::vector { 23, 241, 137, 36, + 206, 5, 227, 56, + 254, 176, 231, 47, + 18, 201, 161, 11 }); + + fill_tensor(input_gate_bias, std::vector {-103038, 30525, 115255, -38154 }); + fill_tensor(forget_gate_bias, std::vector { -23428, 126970, 116806, 46307 }); + fill_tensor(cell_gate_bias, std::vector { 128006, 69949, -42808, 42568 }); + fill_tensor(output_gate_bias, std::vector { -67066, -53607, 47233, 7300 }); + + SimpleTensor expected_output(output_shape, DataType::QASYMM8, 1, qasymm); + + // Initialize state + fill_tensor(output_state, std::vector { 128, 128, 128, 128, + 128, 128, 128, 128 }); + fill_tensor(cell_state, std::vector { 0, 0, 0, 0, + 0, 0, 0, 0 }); + + // First input + fill_tensor(input, std::vector { 106, 193, + 155, 150 }); + + fill_tensor(expected_output, std::vector { 128, 130, 36, 134, + 128, 131, 35, 133 }); + + lstmq.run(); + validate(CLAccessor(output_state), expected_output); + + // Second input + fill_tensor(expected_output, std::vector { 128, 129, 12, 137, + 128, 131, 10, 136 }); + lstmq.run(); + validate(CLAccessor(output_state), expected_output); + + // Third input + fill_tensor(expected_output, std::vector { 128, 129, 8, 140, + 128, 130, 6, 138 }); + lstmq.run(); + validate(CLAccessor(output_state), expected_output); +} + +TEST_CASE(IntegrationTestCaseLarge, framework::DatasetMode::PRECOMMIT) +{ + const int batch_size = 16; + const int input_size = 8; + const int output_size = 8; + + + QuantizationInfo qasymm(1.f / 128.f, 128); + QuantizationInfo qweights(1.f / 128.f, 128); + QuantizationInfo qsymm_3(8.f / 32768.f, 0); + QuantizationInfo qsymm_4(16.f / 32768.f, 0); + + TensorShape input_shape{ input_size, batch_size }; + TensorShape input_weights_shape{ input_size, output_size }; + TensorShape recurrent_weights_shape{ output_size, output_size }; + TensorShape output_shape{ output_size, batch_size}; + TensorShape bias_shape{ 
output_size }; + + auto input_to_input_weights = create_tensor(input_weights_shape, DataType::QASYMM8, 1, qweights); + auto input_to_forget_weights = create_tensor(input_weights_shape, DataType::QASYMM8, 1, qweights); + auto input_to_cell_weights = create_tensor(input_weights_shape, DataType::QASYMM8, 1, qweights); + auto input_to_output_weights = create_tensor(input_weights_shape, DataType::QASYMM8, 1, qweights); + auto recurrent_to_input_weights = create_tensor(recurrent_weights_shape, DataType::QASYMM8, 1, qweights); + auto recurrent_to_forget_weights = create_tensor(recurrent_weights_shape, DataType::QASYMM8, 1, qweights); + auto recurrent_to_cell_weights = create_tensor(recurrent_weights_shape, DataType::QASYMM8, 1, qweights); + auto recurrent_to_output_weights = create_tensor(recurrent_weights_shape, DataType::QASYMM8, 1, qweights); + auto input_gate_bias = create_tensor(bias_shape, DataType::S32); + auto forget_gate_bias = create_tensor(bias_shape, DataType::S32); + auto cell_gate_bias = create_tensor(bias_shape, DataType::S32); + auto output_gate_bias = create_tensor(bias_shape, DataType::S32); + + // LSTM input + auto input = create_tensor(input_shape, DataType::QASYMM8, 1, qasymm); + + // LSTM output state + auto output_state = create_tensor(output_shape, DataType::QASYMM8, 1, qasymm); + + // LSTM cell state + auto cell_state = create_tensor(output_shape, DataType::QSYMM16, 1, qsymm_4); + + CLLSTMLayerQuantized lstmq; + + lstmq.configure(&input, &input_to_input_weights, &input_to_forget_weights, &input_to_cell_weights, &input_to_output_weights, + &recurrent_to_input_weights, &recurrent_to_forget_weights, &recurrent_to_cell_weights, &recurrent_to_output_weights, + &input_gate_bias, &forget_gate_bias, &cell_gate_bias, &output_gate_bias, &cell_state, &output_state, &cell_state, &output_state); + + input.allocator()->allocate(); + input_to_input_weights.allocator()->allocate(); + input_to_forget_weights.allocator()->allocate(); + input_to_cell_weights.allocator()->allocate(); + input_to_output_weights.allocator()->allocate(); + recurrent_to_input_weights.allocator()->allocate(); + recurrent_to_forget_weights.allocator()->allocate(); + recurrent_to_cell_weights.allocator()->allocate(); + recurrent_to_output_weights.allocator()->allocate(); + input_gate_bias.allocator()->allocate(); + forget_gate_bias.allocator()->allocate(); + cell_gate_bias.allocator()->allocate(); + output_gate_bias.allocator()->allocate(); + cell_state.allocator()->allocate(); + output_state.allocator()->allocate(); + + // Fill weights and biases + fill_tensor(input_to_input_weights, std::vector{ 141, 89, 200, 180, 46, 50, 87, 128, + 149, 227, 177, 187, 212, 229, 54, 111, + 131, 116, 3, 58, 196, 26, 131, 255, + 22, 106, 216, 69, 239, 12, 232, 207, + 184, 56, 236, 172, 28, 143, 161, 124, + 255, 33, 197, 122, 47, 197, 26, 229, + 91, 79, 11, 160, 26, 80, 100, 36, + 248, 186, 97, 61, 125, 46, 14, 100, }); + + fill_tensor(input_to_forget_weights, std::vector { 237, 165, 141, 249, 72, 116, 36 , 115, + 234, 213, 85, 84, 59, 62, 150, 246, + 182, 102, 158, 214, 182, 183, 94, 11, + 158, 192, 92, 189, 160, 219, 206, 249, + 88, 213, 193, 244, 151, 72, 129, 49, + 239, 83, 106, 9, 169, 187, 125, 171, + 32, 141, 126, 92, 13, 36, 224, 150, + 187, 250, 178, 169, 89, 214, 91, 173 }); + + fill_tensor(input_to_cell_weights, std::vector { 93, 103, 226, 139, 185, 252, 129, 171, + 159, 32, 25, 175, 224, 183, 165, 35, + 207, 69, 238, 228, 149, 214, 79, 6, + 5, 66, 102, 14, 19, 111, 36, 143, + 22, 85, 13, 78, 236, 121, 122, 77, + 249, 39, 
88, 12, 205, 143, 93, 240, + 167, 89, 188, 50, 73, 69, 201, 251, + 59, 32, 203, 184, 139, 191, 199, 74}); + + fill_tensor(input_to_output_weights, std::vector { 205, 7, 95, 104, 252, 143, 226, 73, + 229, 114, 152, 171, 221, 153, 73, 229, + 153, 165, 223, 239, 100, 38, 172, 211, + 226, 133, 239, 207, 116, 230, 170, 100, + 241, 95, 171, 124, 63, 115, 32, 127, + 141, 239, 53, 193, 201, 53, 104, 178, + 186, 212, 167, 107, 226, 230, 71, 213, + 148, 217, 19, 248, 233, 195, 183, 156 }); + + fill_tensor(recurrent_to_input_weights, std::vector { 147, 112, 140, 103, 3, 255, 17, 49, + 84, 112, 144, 213, 138, 142, 112, 66, + 117, 30, 101, 35, 25, 132, 211, 229, + 183, 208, 102, 16, 38, 85, 101, 152, + 226, 83, 132, 22, 161, 110, 157, 129, + 184, 63, 168, 42, 220, 126, 209, 157, + 5, 88, 243, 83, 249, 19, 226, 209, + 173, 96, 185, 77, 146, 227, 238, 136 }); + + + fill_tensor(recurrent_to_forget_weights, std::vector { 52, 132, 92, 200, 213, 32, 213, 37, + 116, 142, 116, 180, 4, 172, 158, 143, + 110, 40, 99, 28, 221, 153, 133, 2, + 247, 144, 198, 100, 20, 15, 221, 196, + 159, 178, 188, 151, 171, 15, 25, 217, + 178, 109, 110, 118, 128, 39, 232, 234, + 184, 214, 177, 13, 56, 6, 28, 252, + 89, 187, 242, 59, 146, 111, 132, 129}); + + fill_tensor(recurrent_to_cell_weights, std::vector { 70, 44, 137, 29, 36, 127, 1, 241, + 26, 241, 142, 114, 67, 181, 49, 57, + 131, 152, 175, 77, 23, 63, 37, 124, + 150, 113, 95, 103, 110, 201, 69, 97, + 196, 242, 62, 214, 66, 19, 45, 135, + 22, 168, 149, 104, 77, 101, 36, 68, + 170, 116, 222, 100, 109, 1, 154, 18, + 133, 215, 105, 93, 31, 57, 231, 112 }); + + + fill_tensor(recurrent_to_output_weights, std::vector { 45 , 181 , 220 , 219 , 49 , 63 , 49 , 129, + 7 , 166 , 104 , 114 , 83 , 40 , 1 , 195, + 245 , 142 , 82 , 232 , 104 , 245 , 82 , 196, + 111 , 56 , 156 , 9 , 141 , 240 , 180 , 148, + 247 , 198 , 234 , 137 , 13 , 210 , 161 , 192, + 196 , 59 , 233 , 184 , 142 , 187 , 140 , 166, + 2 , 95 , 152 , 46 , 71 , 46 , 113 , 32, + 175 , 229 , 86 , 87 , 62 , 93 , 74 , 130}); + + fill_tensor(input_gate_bias, std::vector { -40040, -106916, -92315, -79123, 45160, -17954, 50962, -63758 }); + fill_tensor(forget_gate_bias, std::vector { -128514, 8463, -57831, 116977, 106547, -28132, -124557, 44941 }); + fill_tensor(cell_gate_bias, std::vector { 88388 , 123601, -116148, -13022, 21619, 48926, 57523, 39332 }); + fill_tensor(output_gate_bias, std::vector { 59485 , -33070, 21386, -100633, -115959, 125768, -56407, 24897 }); + + SimpleTensor expected_output(output_shape, DataType::QASYMM8, 1, qasymm); + + // Initialize state + fill_tensor(output_state, std::vector { 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 }); + + fill_tensor(cell_state, std::vector { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 
+ 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}); + + // First input + fill_tensor(input, std::vector { 247, 203, 159, 131, 182, 114, 207, 195, + 48 , 61 , 154, 16, 80, 101, 116, 255, + 50 , 115 , 45, 186, 75, 212, 98, 48, + 88 , 146 , 24, 143, 218, 174, 203, 200, + 239 , 16 , 66, 136, 234, 54, 94, 51, + 101 , 128 , 220, 213, 164, 82, 137, 255, + 70 , 165 , 234, 220, 66, 35, 183, 206, + 39 , 57 , 180, 202, 23, 172, 224, 109, + 102 , 215 , 186, 82, 215, 147, 85, 187, + 96 , 249 , 59, 116, 150, 44, 167, 128, + 34 , 217 , 148, 193, 243, 38, 250, 208, + 112 , 130 , 208, 29, 16, 122, 20, 92, + 24 , 72 , 104, 29, 150, 233, 151, 19, + 158 , 192 , 254, 70, 73, 142, 106, 152, + 3 , 61 , 24, 135, 212, 9, 80, 234, + 147 , 246 , 83, 249, 49, 14, 68, 50}); + + fill_tensor(expected_output, std::vector {131, 128, 128, 128, 128, 180, 129, 133, + 136, 128, 126, 128, 128, 173, 135, 130, + 160, 128, 128, 128, 128, 138, 132, 129, + 131, 128, 127, 128, 128, 169, 129, 131, + 133, 128, 128, 128, 128, 182, 130, 129, + 131, 128, 128, 128, 128, 163, 129, 130, + 131, 128, 128, 128, 128, 149, 132, 129, + 143, 128, 127, 128, 128, 150, 134, 131, + 134, 128, 128, 128, 128, 167, 130, 130, + 131, 128, 128, 128, 128, 152, 132, 129, + 128, 128, 128, 128, 128, 169, 130, 130, + 173, 128, 128, 128, 128, 148, 139, 130, + 152, 128, 128, 128, 128, 168, 139, 132, + 147, 128, 128, 128, 128, 161, 131, 132, + 130, 128, 128, 128, 128, 159, 134, 128, + 140, 128, 128, 128, 128, 133, 132, 128 }); + + lstmq.run(); + validate(CLAccessor(output_state), expected_output); + + // Second input + fill_tensor(expected_output, std::vector { 130, 128, 128, 128, 128, 205, 129, 137, + 135, 128, 127, 128, 128, 190, 137, 132, + 160, 128, 128, 128, 128, 142, 133, 131, + 130, 128, 128, 128, 128, 185, 129, 133, + 132, 128, 128, 128, 128, 198, 131, 130, + 130, 128, 128, 128, 128, 178, 130, 131, + 131, 128, 128, 128, 128, 158, 132, 131, + 142, 128, 127, 128, 128, 158, 135, 134, + 133, 128, 128, 128, 128, 178, 131, 132, + 131, 128, 128, 128, 128, 160, 132, 130, + 128, 128, 128, 128, 128, 190, 131, 131, + 170, 128, 128, 128, 128, 157, 142, 131, + 149, 128, 128, 128, 128, 178, 142, 135, + 145, 128, 128, 128, 129, 173, 132, 135, + 129, 128, 128, 128, 128, 171, 134, 129, + 140, 128, 128, 128, 128, 135, 132, 129}); + lstmq.run(); + validate(CLAccessor(output_state), expected_output); +} +// clang-format on +// *INDENT-ON* + +TEST_SUITE_END() // LSTMLayerQuantized +TEST_SUITE_END() // NEON +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/CL/WidthConcatenateLayer.cpp b/tests/validation/CL/WidthConcatenateLayer.cpp index 52a4e4ccd6..7b894a63e0 100644 --- a/tests/validation/CL/WidthConcatenateLayer.cpp +++ b/tests/validation/CL/WidthConcatenateLayer.cpp @@ -98,9 +98,12 @@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - - concat_layer.configure({ &src1, &src2, &src3 }, &dst, 0); + CLConcatenateLayer concat_layer; + std::vector inputs; + inputs.emplace_back(&src1); + inputs.emplace_back(&src2); + inputs.emplace_back(&src3); + concat_layer.configure(inputs, &dst, 0); } template diff --git a/tests/validation/NEON/LSTMLayerQuantized.cpp b/tests/validation/NEON/LSTMLayerQuantized.cpp index 41c12c91e7..d5d036de33 100644 --- 
a/tests/validation/NEON/LSTMLayerQuantized.cpp
+++ b/tests/validation/NEON/LSTMLayerQuantized.cpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
 #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
+
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/Utils.h"
@@ -131,8 +131,6 @@ TEST_CASE(IntegrationTestCaseSmall, framework::DatasetMode::PRECOMMIT)
     output_gate_bias.allocator()->allocate();
     cell_state.allocator()->allocate();
     output_state.allocator()->allocate();
-    cell_state.allocator()->allocate();
-    output_state.allocator()->allocate();

     // Fill weights and biases
     fill_tensor(input_to_input_weights, std::vector<uint8_t>{ 47, 168,
@@ -452,7 +450,7 @@ TEST_CASE(IntegrationTestCaseLarge, framework::DatasetMode::PRECOMMIT)
 // *INDENT-ON*

 TEST_SUITE_END() // LSTMLayerQuantized
-TEST_SUITE_END() // CL
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/DequantizationLayerFixture.h b/tests/validation/fixtures/DequantizationLayerFixture.h
index 15f3711189..2c8f05746d 100644
--- a/tests/validation/fixtures/DequantizationLayerFixture.h
+++ b/tests/validation/fixtures/DequantizationLayerFixture.h
@@ -92,32 +92,46 @@ protected:
     SimpleTensor<T> compute_reference(const TensorShape &shape, DataType src_data_type)
     {
-        if(is_data_type_quantized_asymmetric(src_data_type))
+        if(src_data_type == DataType::QASYMM8)
         {
             SimpleTensor<uint8_t> src{ shape, src_data_type, 1, _quantization_info };
             fill(src);
             return reference::dequantization_layer<T>(src);
         }
-        else
+        else if(src_data_type == DataType::QSYMM8)
         {
             SimpleTensor<int8_t> src{ shape, src_data_type, 1, _quantization_info };
             fill(src);
             return reference::dequantization_layer<T>(src);
         }
+        else if(src_data_type == DataType::QSYMM16)
+        {
+            SimpleTensor<int16_t> src{ shape, src_data_type, 1, _quantization_info };
+            fill(src);
+            return reference::dequantization_layer<T>(src);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Unsupported data type");
+        }
     }

 protected:
     QuantizationInfo generate_quantization_info(DataType data_type)
     {
-        std::uniform_int_distribution<> distribution(1, 127);
         std::mt19937 gen(library.get()->seed());
+        std::uniform_int_distribution<> distribution_scale_q8(1, 255);
+        std::uniform_int_distribution<> distribution_offset_q8(1, 127);
+        std::uniform_int_distribution<> distribution_scale_q16(1, 32768);

         switch(data_type)
         {
+            case DataType::QSYMM16:
+                return QuantizationInfo(1.f / distribution_scale_q16(gen));
             case DataType::QSYMM8:
-                return QuantizationInfo(1.f / distribution(gen));
+                return QuantizationInfo(1.f / distribution_scale_q8(gen));
             case DataType::QASYMM8:
-                return QuantizationInfo(1.f / distribution(gen), distribution(gen));
+                return QuantizationInfo(1.f / distribution_scale_q8(gen), distribution_offset_q8(gen));
             default:
                 ARM_COMPUTE_ERROR("Unsupported data type");
         }
diff --git a/tests/validation/reference/DequantizationLayer.cpp b/tests/validation/reference/DequantizationLayer.cpp
index d07371c883..cceee0421c 100644
--- a/tests/validation/reference/DequantizationLayer.cpp
+++ b/tests/validation/reference/DequantizationLayer.cpp
@@ -45,6 +45,11 @@ TOut dequantize(uint8_t val, const UniformQuantizationInfo qinfo)
 {
     return static_cast<TOut>(dequantize_qasymm8(val, qinfo));
 }
+template <typename TOut>
+TOut dequantize(int16_t val, const UniformQuantizationInfo qinfo)
+{
+    return static_cast<TOut>(dequantize_qsymm16(val, qinfo));
+}

 template <typename TOut, typename TIn>
 SimpleTensor<TOut> dequantization_layer_nchw(const SimpleTensor<TIn> &src)
@@ -72,7 +77,7 @@ SimpleTensor<TOut> dequantization_layer_nchw(const SimpleTensor<TIn> &src)
             // Dequantize slice
             for(int s = 0; s < WH; ++s)
             {
-                dst[idx + s] = dequantize<TOut>(src[idx + s], channel_qinfo);
+                dst[idx + s] = dequantize<TOut>(static_cast<TIn>(src[idx + s]), channel_qinfo);
             }
         }
     }
@@ -84,7 +89,7 @@ SimpleTensor<TOut> dequantization_layer_nchw(const SimpleTensor<TIn> &src)
         for(int i = 0; i < src.num_elements(); ++i)
         {
-            dst[i] = static_cast<TOut>(dequantize<TOut>(src[i], quantization_info));
+            dst[i] = static_cast<TOut>(dequantize<TOut>(static_cast<TIn>(src[i]), quantization_info));
         }
     }
@@ -109,6 +114,8 @@ template SimpleTensor<half> dequantization_layer(const SimpleTensor<uint8_t> &sr
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<uint8_t> &src);
 template SimpleTensor<half> dequantization_layer(const SimpleTensor<int8_t> &src);
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<int8_t> &src);
+template SimpleTensor<half> dequantization_layer(const SimpleTensor<int16_t> &src);
+template SimpleTensor<float> dequantization_layer(const SimpleTensor<int16_t> &src);
 } // namespace reference
 } // namespace validation
 } // namespace test
-- cgit v1.2.1
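
A note on the fixed-point format exercised by the tests above: the LSTM cell state is kept in QSYMM16, a 16-bit symmetric scheme with no zero-point offset, so a stored int16_t maps back to a real value as real = q * scale. With the scale used in the tests (qsymm_4 = 16.f / 32768.f) the cell state covers roughly [-16, 16) in steps of about 4.9e-4. The standalone sketch below illustrates that mapping; it is not part of the patch, only the qsymm_4 scale is taken from the tests, and the local quantize_q16/dequantize_q16 lambdas are stand-ins that mirror the library helpers in behaviour rather than the library implementation itself.

// Standalone illustration of the QSYMM16 mapping used for the LSTM cell state above.
// Assumption: only the qsymm_4 scale (16.f / 32768.f) is taken from the tests; the
// lambdas below are local sketches, not the Compute Library API.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main()
{
    const float cell_scale = 16.f / 32768.f; // qsymm_4: ~[-16, 16) in ~4.9e-4 steps

    // Symmetric quantization has no offset: real = q * scale.
    auto dequantize_q16 = [](int16_t q, float scale) { return q * scale; };
    auto quantize_q16   = [](float v, float scale)
    {
        int32_t q = static_cast<int32_t>(std::lround(v / scale));
        q         = std::max<int32_t>(-32768, std::min<int32_t>(32767, q)); // saturate to int16_t
        return static_cast<int16_t>(q);
    };

    const int16_t q = quantize_q16(1.5f, cell_scale);
    std::cout << "q = " << q                                                // 3072
              << ", back to float = " << dequantize_q16(q, cell_scale)     // 1.5
              << "\n";
    return 0;
}

The same value * scale relation is what the reference dequantize_qsymm16 path added in this patch computes, which is why the validation above can compare the QSYMM16 cell state against a float reference directly.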