From b18252dce941001d8980721596709ea01d55747a Mon Sep 17 00:00:00 2001 From: Sheri Zhang Date: Tue, 7 Apr 2020 11:04:57 +0100 Subject: COMPMID-3239: Implement QSYMM16 LayerNormalizationKernel for CL Signed-off-by: Sheri Zhang Change-Id: Ib1577c4a9aa29293a903731b2a4083b0d2243e1e Signed-off-by: Sheri Zhang Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2994 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Sang-Hoon Park Reviewed-by: Michele Di Giorgio --- Android.bp | 1 + arm_compute/core/CL/CLKernels.h | 1 + .../CL/kernels/CLQLSTMLayerNormalizationKernel.h | 88 +++++++ .../NEON/functions/NEBoundingBoxTransform.h | 4 +- src/core/CL/CLKernelLibrary.cpp | 5 + src/core/CL/cl_kernels/gemmlowp.cl | 2 +- src/core/CL/cl_kernels/helpers_asymm.h | 34 ++- .../CL/cl_kernels/qlstm_layer_normalization.cl | 260 +++++++++++++++++++++ .../CL/kernels/CLQLSTMLayerNormalizationKernel.cpp | 166 +++++++++++++ tests/validation/CL/QLSTMLayerNormalization.cpp | 197 ++++++++++++++++ tests/validation/NEON/QLSTMLayerNormalization.cpp | 2 +- .../fixtures/QLSTMLayerNormalizationFixture.h | 34 ++- 12 files changed, 785 insertions(+), 9 deletions(-) create mode 100644 arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h create mode 100644 src/core/CL/cl_kernels/qlstm_layer_normalization.cl create mode 100644 src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp create mode 100644 tests/validation/CL/QLSTMLayerNormalization.cpp diff --git a/Android.bp b/Android.bp index 7c63e6f002..80ea3e36b8 100644 --- a/Android.bp +++ b/Android.bp @@ -173,6 +173,7 @@ cc_library_static { "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp", "src/core/CL/kernels/CLPoolingLayerKernel.cpp", "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp", + "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp", "src/core/CL/kernels/CLQuantizationLayerKernel.cpp", "src/core/CL/kernels/CLROIAlignLayerKernel.cpp", "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp", diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index b265aa2fe7..583cf270e2 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -125,6 +125,7 @@ #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" #include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" #include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h" #include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" diff --git a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h new file mode 100644 index 0000000000..3206eda729 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H
+#define ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to do layer normalization. */
+class CLQLSTMLayerNormalizationKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLQLSTMLayerNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLQLSTMLayerNormalizationKernel(const CLQLSTMLayerNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLQLSTMLayerNormalizationKernel &operator=(const CLQLSTMLayerNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLQLSTMLayerNormalizationKernel(CLQLSTMLayerNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLQLSTMLayerNormalizationKernel &operator=(CLQLSTMLayerNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~CLQLSTMLayerNormalizationKernel() = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input  Source tensor with 2 dimensions. Data types supported: QSYMM16.
+     * @param[out] output Destination tensor. Data type supported: same as @p input
+     * @param[in]  weight Weight tensor. Data types supported: Same as @p input.
+     * @param[in]  bias   Bias tensor. Data types supported: S32.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor with 2 dimensions. Data types supported: QSYMM16.
+     * @param[out] output          Destination tensor. Data type supported: same as @p input
+     * @param[in]  weight          Weight tensor. Data types supported: Same as @p input.
+     * @param[in]  bias            Bias tensor. Data types supported: S32.
+     */
+    void configure(CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
+     *
+     * @param[in] input  Source tensor info with 2 dimensions. Data types supported: QSYMM16.
+     * @param[in] output Destination tensor info. Data type supported: same as @p input
+     * @param[in] weight Weight tensor info. Data types supported: Same as @p input.
+     * @param[in] bias   Bias tensor info. Data types supported: S32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_weight;
+    const ICLTensor *_bias;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H */
diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
index 36a35d3890..27c1c5198b 100644
--- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
+++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,4 +66,4 @@ public:
     static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBOUNDINGBOXTRANSFORM_H */
+#endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORM_H */
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 7437f1bf22..00e7b2bc5c 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -326,6 +326,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
     { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
     { "prior_box_layer_nchw", "prior_box_layer.cl" },
+    { "qlstm_layer_normalization", "qlstm_layer_normalization.cl" },
     { "quantization_layer", "quantization_layer.cl" },
     { "range", "range.cl" },
     { "range_quantized", "range.cl" },
@@ -822,6 +823,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
     {
         "prior_box_layer.cl",
 #include "./cl_kernels/prior_box_layer.clembed"
+    },
+    {
+        "qlstm_layer_normalization.cl",
+#include "./cl_kernels/qlstm_layer_normalization.clembed"
     },
     {
         "quantization_layer.cl",
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index b707ec8175..d6f415b715 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -2277,7 +2277,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
  * @param[in]  biases_stride_x                       (Optional) Stride of the biases tensor in X dimension (in bytes)
  * @param[in]  biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[out] dst_ptr                               Pointer to the destination tensor Supported data type: QSYMM16
  * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                            dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 6377dbadb1..37eb246bfb 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -145,9 +145,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) b_64 = convert_long##size(b); \ VEC_DATA_TYPE(long, size) \ ab_64 = a_64 * b_64; \ - /* COMPMID-907 */ \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ return select(ab_x2_high32, INT_MAX, overflow); \ } @@ -397,6 +407,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ + } +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift) + QUANTIZE_IMPL(uchar, 1) QUANTIZE_IMPL(char, 1) QUANTIZE_IMPL(uint, 1) @@ -442,16 +461,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) +ASYMM_SELECT_USING_MASK_IMPL(1) ASYMM_SELECT_USING_MASK_IMPL(2) ASYMM_SELECT_USING_MASK_IMPL(4) ASYMM_SELECT_USING_MASK_IMPL(8) ASYMM_SELECT_USING_MASK_IMPL(16) +ASYMM_MASK_IF_ZERO_IMPL(1) ASYMM_MASK_IF_ZERO_IMPL(2) ASYMM_MASK_IF_ZERO_IMPL(4) ASYMM_MASK_IF_ZERO_IMPL(8) ASYMM_MASK_IF_ZERO_IMPL(16) +ASYMM_MASK_IF_NON_ZERO_IMPL(1) ASYMM_MASK_IF_NON_ZERO_IMPL(2) ASYMM_MASK_IF_NON_ZERO_IMPL(4) ASYMM_MASK_IF_NON_ZERO_IMPL(8) @@ -467,6 +489,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) @@ -482,9 +505,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) +ASYMM_RESCALE_IMPL(1) ASYMM_RESCALE_IMPL(2) ASYMM_RESCALE_IMPL(4) ASYMM_RESCALE_IMPL(8) ASYMM_RESCALE_IMPL(16) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) + #endif // ARM_COMPUTE_HELPERS_ASYMM_H diff --git a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/qlstm_layer_normalization.cl new file mode 100644 index 0000000000..08f0b53632 --- /dev/null +++ b/src/core/CL/cl_kernels/qlstm_layer_normalization.cl @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2020 
ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#if VEC_SIZE == 2
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 2)
+#define PERFORM_REDUCTION_IMPL(type)                                                    \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 2) sum)  \
+    {                                                                                   \
+        sum.s0 += sum.s1;                                                               \
+        return sum.s0;                                                                  \
+    }
+#elif VEC_SIZE == 4
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 4)
+#define PERFORM_REDUCTION_IMPL(type)                                                    \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 4) sum)  \
+    {                                                                                   \
+        sum.s01 += sum.s23;                                                             \
+        sum.s0 += sum.s1;                                                               \
+        return sum.s0;                                                                  \
+    }
+#elif VEC_SIZE == 8
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 8)
+#define PERFORM_REDUCTION_IMPL(type)                                                    \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 8) sum)  \
+    {                                                                                   \
+        sum.s0123 += sum.s4567;                                                         \
+        sum.s01 += sum.s23;                                                             \
+        sum.s0 += sum.s1;                                                               \
+        return sum.s0;                                                                  \
+    }
+#else /* VEC_SIZE DEFAULT */
+#define VEC_SIZE 16
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 16)
+#define PERFORM_REDUCTION_IMPL(type)                                                    \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 16) sum) \
+    {                                                                                   \
+        sum.s01234567 += sum.s89abcdef;                                                 \
+        sum.s0123 += sum.s4567;                                                         \
+        sum.s01 += sum.s23;                                                             \
+        sum.s0 += sum.s1;                                                               \
+        return sum.s0;                                                                  \
+    }
+#endif /* VEC_SIZE END */
+
+#define PERFORM_REDUCTION_STR(input, type) perform_reduction_##type(input)
+#define PERFORM_REDUCTION(input, type) PERFORM_REDUCTION_STR(input, type)
+
+PERFORM_REDUCTION_IMPL(int)
+PERFORM_REDUCTION_IMPL(long)
+
+/** Compute quantized multiplier and shift for the inverse square root of the input.
+ * Using 3-bit fixed point and 5 iterations of the Newton-Raphson method.
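+ * The routine appears to mirror TFLite's GetInvSqrtQuantizedMultiplierExp: the input is first
+ * scaled by an even number of bits into the working range, x is initialised to 1.0 and refined
+ * with x <- x * (3 - in * x^2) / 2, and the result is multiplied by sqrt(2)/2 to compensate for
+ * the bit-pair scaling.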
+ *
+ * @param[in] in            Input to use
+ * @param[in] reverse_shift -1 to reverse the shift direction
+ *
+ * @return:
+ *     .s0 Quantized multiplier for inverse square root
+ *     .s1 Shift for inverse square root
+ *
+ */
+inline int2 get_invsqrt_quantized_multiplier_exp(int in, int reverse_shift)
+{
+    int2 stddev_inv;
+    int  stddev_inv_multiplier = INT_MAX;
+    int  stddev_inv_shift      = 0;
+    int  input                 = in;
+    if(input <= 1)
+    {
+        stddev_inv.s0 = stddev_inv_multiplier;
+        stddev_inv.s1 = stddev_inv_shift;
+        return stddev_inv;
+    }
+
+    stddev_inv_shift = 11;
+    // Bring the input below 2^29, adjusting the shift accordingly
+    while(input >= (1 << 29))
+    {
+        input /= 4;
+        ++stddev_inv_shift;
+    }
+
+    // Maximise precision by shifting the input left by an even number of bits
+    const unsigned int max_left_shift_bits       = clz(input) - 1;
+    const unsigned int max_left_shift_bits_pairs = max_left_shift_bits / 2;
+    const unsigned int left_shift_bit_pairs      = max_left_shift_bits_pairs - 1;
+    stddev_inv_shift -= left_shift_bit_pairs;
+    input <<= 2 * left_shift_bit_pairs;
+
+    typedef int FixedPointRawType;
+    const unsigned int fixedpoint_position     = 3;
+    const unsigned int fixedpoint_int_position = sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position;
+    typedef FixedPointRawType FixedPoint3;
+    typedef FixedPointRawType FixedPoint0;
+
+    const FixedPoint3 fixedpoint_input      = (input >> 1);
+    const FixedPoint3 fixedpoint_half_input = ASYMM_ROUNDING_DIVIDE_BY_POW2(fixedpoint_input, 1, 1);
+    const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1));
+    FixedPoint3 x                           = 0x1 << fixedpoint_int_position;
+
+    // Newton-Raphson iterations for 1/sqrt(input): x <- x * (3 - input * x^2) / 2
+    const int num_iteration = 5;
+    for(int i = 0; i < num_iteration; i++)
+    {
+        int x3 = ASYMM_RESCALE(ASYMM_MULT(ASYMM_MULT(x, x, 1), x, 1), 9, fixedpoint_position, 1);
+        x      = ASYMM_RESCALE(ASYMM_MULT(fixedpoint_half_three, x, 1) - ASYMM_MULT(fixedpoint_half_input, x3, 1), 6, fixedpoint_position, 1);
+    }
+    // 1518500250 = sqrt(2) / 2 * 2^31, compensating for the even-bit scaling above
+    const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
+    x                     = ASYMM_MULT(fixedpoint_half_sqrt_2, x, 1);
+    stddev_inv_multiplier = x;
+    if(stddev_inv_shift < 0)
+    {
+        stddev_inv_multiplier <<= -stddev_inv_shift;
+        stddev_inv_shift = 0;
+    }
+    // Apply the requested shift direction
+    stddev_inv_shift *= reverse_shift;
+
+    stddev_inv.s0 = stddev_inv_multiplier;
+    stddev_inv.s1 = stddev_inv_shift;
+    return stddev_inv;
+}
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+/** This function implements QLSTM layer normalization.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=short
+ * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
+ *
+ * @param[in]  input_ptr                             Pointer to the first source tensor. Supported data types: QSYMM16
+ * @param[in]  input_stride_x                        Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the first source tensor
+ * @param[in]  weight_ptr                            Pointer to the weight tensor. Supported data type: same as @p input_ptr
+ * @param[in]  weight_stride_x                       Stride of the weight tensor in X dimension (in bytes)
+ * @param[in]  weight_step_x                         weight_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weight_offset_first_element_in_bytes  The offset of the first element in the weight tensor
+ * @param[in]  bias_ptr                              Pointer to the bias tensor. Supported data type: S32
+ * @param[in]  bias_stride_x                         Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  bias_step_x                           bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes    The offset of the first element in the biases tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void qlstm_layer_normalization(
+    IMAGE_DECLARATION(input),
+    VECTOR_DECLARATION(weight),
+    VECTOR_DECLARATION(bias),
+    IMAGE_DECLARATION(output))
+{
+    // Get pixel pointers
+    Image  input  = CONVERT_TO_IMAGE_STRUCT(input);
+    Vector weight = CONVERT_TO_VECTOR_STRUCT(weight);
+    Vector bias   = CONVERT_TO_VECTOR_STRUCT(bias);
+    Image  output = CONVERT_TO_IMAGE_STRUCT(output);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    sum = 0;
+    VEC_DATA_TYPE(long, VEC_SIZE)
+    sum_sq = 0;
+    // Calculate partial sums of the values and their squares
+    int i = 0;
+    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+    {
+        // Load data
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
+
+        sum += CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE));
+        sum_sq += CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)) * CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE));
+    }
+    // Perform reduction
+    sum.s0    = PERFORM_REDUCTION(sum, int);
+    sum_sq.s0 = PERFORM_REDUCTION(sum_sq, long);
+
+    // Left-overs loop
+    for(; i < WIDTH; ++i)
+    {
+        DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
+
+        sum.s0 += CONVERT(data, int);
+        sum_sq.s0 += CONVERT(data, long) * CONVERT(data, long);
+    }
+
+    int  temp       = 0x100000 / WIDTH;
+    int  mean       = (int)(sum.s0 * 1024 / WIDTH); // Mean of the row, kept with 10 fractional bits
+    int  var2       = ((sum_sq.s0 * (long)temp) - ((long)mean * (long)mean)) / 0x100000; // Approximates the variance in raw input units
+    int2 stddev_inv = get_invsqrt_quantized_multiplier_exp(var2, -1); // Quantized multiplier/shift pair for 1 / stddev
+
+    i = 0;
+    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
+        VEC_DATA_TYPE(int, VEC_SIZE)
+        res = CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)) * 1024 - mean;
+        res = multiply_by_quantized_multiplier(res, stddev_inv.s0, stddev_inv.s1);
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        w = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)vector_offset(&weight, i));
+        res = res * CONVERT(w, VEC_DATA_TYPE(int, VEC_SIZE));
+        res = res + VLOAD(VEC_SIZE)(0, (__global int *)vector_offset(&bias, i));
+        // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
+        res = (res + 512) >> 10;
+        res = multiply_by_quantized_multiplier(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12);
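+        // OUTPUT_MULTIPLIER/OUTPUT_SHIFT encode the weight scale computed in configure(); the
+        // extra +12 appears to account for the 12 fractional bits (Q3.12) that downstream QLSTM
+        // stages expect of layer-normalized values.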
+#if defined(MIN_BOUND)
+        res = max(res, (VEC_DATA_TYPE(int, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+        res = min(res, (VEC_DATA_TYPE(int, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+        VSTORE(VEC_SIZE)
+        (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)offset(&output, i, 0));
+    }
+    for(; i < WIDTH; ++i)
+    {
+        DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
+        int       res  = (int)data * 1024 - mean;
+        res            = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, stddev_inv.s0, stddev_inv.s1, 1);
+        DATA_TYPE w    = *((__global DATA_TYPE *)vector_offset(&weight, i));
+        res            = res * (int)w;
+        int b          = *((__global int *)vector_offset(&bias, i));
+        res            = res + b;
+        // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
+        res = (res + 512) >> 10;
+        res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12, 1);
+#if defined(MIN_BOUND)
+        res = max(res, MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+        res = min(res, MAX_BOUND);
+#endif // defined(MAX_BOUND)
+        *((__global DATA_TYPE *)offset(&output, i, 0)) = (DATA_TYPE)res;
+    }
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
new file mode 100644
index 0000000000..187c517088
--- /dev/null
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input);
+
+    const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+    /* If the width is less than the step, make the step the same as the width to avoid the global size being the step instead of the actual width. */
+    /* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */
+    const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration;
+
+    // This kernel doesn't need padding
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+Status validate_arguments(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Input tensor cannot have more than 2 dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weight->num_dimensions() > 1, "Weight tensor cannot have more than 1 dimension");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Bias tensor cannot have more than 1 dimension");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QSYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weight, 1, DataType::QSYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x());
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+    return Status{};
+}
+} // namespace
+
+CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel()
+    : _input(nullptr), _weight(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLQLSTMLayerNormalizationKernel::configure(CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info()));
+
+    _input  = input;
+    _weight = weight;
+    _bias   = bias;
+    _output = output;
+
+    const uint32_t num_elems_processed_per_iteration = max_cl_vector_width / input->info()->element_size();
+
+    int32_t                       output_multiplier{};
+    int32_t                       output_shift{};
+    const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform();
+    const Status                  status    = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
+    output_shift *= -1;
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+    build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+    build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+    build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options());
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "qlstm_layer_normalization_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias);
+}
+
+Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+    // Set the slice step equal to the width to force gws[0] to 1, since each work-item normalizes one full row
+    slice.set_dimension_step(Window::DimX, _input->info()->dimension(0));
+
+    Window weight_window;
+    Window weight_slice;
+
+    weight_window.use_tensor_dimensions(_weight->info()->tensor_shape());
+    weight_slice = weight_window.first_slice_window_1D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+        add_1D_tensor_argument(idx, _weight, weight_slice);
+        add_1D_tensor_argument(idx, _bias, weight_slice);
+        add_2D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_2D(slice));
+}
+} // namespace arm_compute
diff --git a/tests/validation/CL/QLSTMLayerNormalization.cpp b/tests/validation/CL/QLSTMLayerNormalization.cpp
new file mode 100644
index 0000000000..ea5eca6261
--- /dev/null
+++ b/tests/validation/CL/QLSTMLayerNormalization.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/QLSTMLayerNormalizationFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<int16_t> tolerance_s16(0); /**< Tolerance value for comparing reference's output against implementation's output for QSYMM16 data types */
+constexpr uint32_t vector_size_byte = 16;
+
+using test::datasets::ShapeDataset;
+template <uint32_t num_elements_per_iter, uint32_t num_batches, uint32_t num_iteration>
+class QLSTMLayerNormShapeDataSet : public ShapeDataset
+{
+    static constexpr auto boundary_minus_one = num_elements_per_iter * num_iteration - 1;
+    static constexpr auto boundary           = num_elements_per_iter * num_iteration;
+    static constexpr auto boundary_plus_one  = num_elements_per_iter * num_iteration + 1;
+
+public:
+    QLSTMLayerNormShapeDataSet(std::string name)
+        : ShapeDataset(name,
+    {
+        TensorShape{ boundary_minus_one, num_batches },
+        TensorShape{ boundary, num_batches },
+        TensorShape{ boundary_plus_one, num_batches }
+    })
+    {
+    }
+};
+
+template <uint32_t num_elements_per_iter, uint32_t num_batches>
+class QLSTMLayerNormShapeDataSet<num_elements_per_iter, num_batches, 0> : public ShapeDataset
+{
+public:
+    QLSTMLayerNormShapeDataSet(std::string name)
+        : ShapeDataset(name,
+    {
+        TensorShape{ 1, num_batches },
+        TensorShape{ 2, num_batches }
+    })
+    {
+    }
+};
+} // namespace
+TEST_SUITE(CL)
+TEST_SUITE(QLSTMLayerNormalization)
+
+static const TensorShape correct_input_shape{ TensorShape(15U, 2U) };
+static const TensorShape correct_weight_shape{ TensorShape(15U) };
+static const TensorShape correct_bias_shape{ TensorShape(15U) };
+static const DataType    correct_input_dt{ DataType::QSYMM16 };
+static const DataType    correct_weight_dt{ DataType::QSYMM16 };
+static const DataType    correct_bias_dt{ DataType::S32 };
+static const uint32_t    tensor_num_channel{ 1 };
+
+// *INDENT-OFF*
+// clang-format off
+
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL,
+    zip(zip(
+        framework::dataset::make("InputInfo", {
+            TensorInfo(correct_input_shape, tensor_num_channel, DataType::F16),         // input supports only QSYMM16
+            TensorInfo(correct_input_shape, tensor_num_channel, correct_input_dt),      // weight supports only QSYMM16
+            TensorInfo(correct_input_shape, tensor_num_channel, correct_input_dt),      // bias supports only S32
+            TensorInfo(TensorShape(15U, 2U, 2U), tensor_num_channel, correct_input_dt), // input supports only up to 2D
+            TensorInfo(correct_input_shape, tensor_num_channel, correct_input_dt),      // weight supports only up to 1D
+            TensorInfo(correct_input_shape, tensor_num_channel, correct_input_dt),      // bias supports only up to 1D
+            TensorInfo(correct_input_shape, tensor_num_channel, correct_input_dt),      // input_shape[0] != weight_shape[0] should fail
+            TensorInfo(correct_input_shape, tensor_num_channel, correct_input_dt),      // weight_shape[0] != bias_shape[0] should fail
+        }),
+        framework::dataset::make("WeightInfo", {
+            TensorInfo(correct_weight_shape, tensor_num_channel, correct_weight_dt),
+            TensorInfo(correct_weight_shape, tensor_num_channel, DataType::F16),
+            TensorInfo(correct_weight_shape, tensor_num_channel, correct_weight_dt),
+            TensorInfo(correct_weight_shape, tensor_num_channel, correct_weight_dt),
+            TensorInfo(TensorShape(15U, 2U), tensor_num_channel, correct_weight_dt),
+            TensorInfo(correct_weight_shape, tensor_num_channel, correct_weight_dt),
+            TensorInfo(TensorShape(14U), tensor_num_channel, correct_weight_dt),
+            TensorInfo(correct_weight_shape, tensor_num_channel, correct_weight_dt),
+        })
+        ),
+        framework::dataset::make("BiasInfo", {
+            TensorInfo(correct_bias_shape, tensor_num_channel, correct_bias_dt),
+            TensorInfo(correct_bias_shape, tensor_num_channel, correct_bias_dt),
+            TensorInfo(correct_bias_shape, tensor_num_channel, DataType::QSYMM16),
+            TensorInfo(correct_bias_shape, tensor_num_channel, correct_bias_dt),
+            TensorInfo(correct_bias_shape, tensor_num_channel, correct_bias_dt),
+            TensorInfo(TensorShape(15U, 2U), tensor_num_channel, correct_bias_dt),
+            TensorInfo(correct_bias_shape, tensor_num_channel, correct_bias_dt),
+            TensorInfo(TensorShape(14U), tensor_num_channel, correct_bias_dt),
+        })
+    ), input_info, weight_info, bias_info)
+{
+    TensorInfo dummy_output{};
+    const Status s = CLQLSTMLayerNormalizationKernel::validate(&input_info, &dummy_output, &weight_info, &bias_info);
+    ARM_COMPUTE_EXPECT(!bool(s), framework::LogLevel::ERRORS);
+}
+
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using CLQLSTMLayerNormalizationFixture = CLQLSTMLayerNormalizationValidationFixture<CLTensor, CLAccessor, CLQLSTMLayerNormalizationKernel, T>;
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QSYMM16)
+
+/** Tests will be targeting
+ * - Comparison between the OpenCL kernel and a scalar version of the same reference kernel
+ * - 1D and 2D input shapes whose first dimension covers the boundary values of the 128-bit vector size (0~3 iterations)
+ * - 1D weight and bias shapes with the same size as the input's first dimension
+ * - Quantization scales both greater than and smaller than one
+ * - Input values are noted in the fixture
+ *
+ * What we can't test
+ * - Since the reference kernel uses exactly the same algorithm in the same quantized domain,
+ *   it is hard to fully test whether the algorithm accomplishes what it is supposed to.
+ * - The algorithm is sensitive to the quantization scale, but it is hard to fully test
+ *   that sensitivity for the aforementioned reason.
+ * - Likewise, it is hard to fully test corner values because the reference kernel and the
+ *   OpenCL kernel share the same algorithm.
+ */
+
+constexpr uint32_t qsymm16_per_vector = vector_size_byte / sizeof(int16_t);
+
+#define QSYMM16_DATASET_ITER(num_input_batch, num_iter)                                                               \
+    combine(combine(zip(zip(QLSTMLayerNormShapeDataSet<qsymm16_per_vector, num_input_batch, num_iter>("InputShape"), \
+                            QLSTMLayerNormShapeDataSet<qsymm16_per_vector, 1, num_iter>("WeightShape")),             \
+                        QLSTMLayerNormShapeDataSet<qsymm16_per_vector, 1, num_iter>("BiasShape")),                   \
+                    framework::dataset::make("DataType", DataType::QSYMM16)),                                        \
+            framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1. / 8192), QuantizationInfo(8192) }))
+
+#define QSYMM16_DATASET_1D \
+    concat(concat(QSYMM16_DATASET_ITER(1, 0), QSYMM16_DATASET_ITER(1, 1)), QSYMM16_DATASET_ITER(1, 2))
+
+#define QSYMM16_DATASET_2D \
+    concat(concat(QSYMM16_DATASET_ITER(3, 0), QSYMM16_DATASET_ITER(3, 1)), QSYMM16_DATASET_ITER(3, 2))
+
+FIXTURE_DATA_TEST_CASE(RandomValue1D, CLQLSTMLayerNormalizationFixture<int16_t>, framework::DatasetMode::ALL, QSYMM16_DATASET_1D)
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_s16);
+}
+
+FIXTURE_DATA_TEST_CASE(RandomValue2D, CLQLSTMLayerNormalizationFixture<int16_t>, framework::DatasetMode::ALL, QSYMM16_DATASET_2D)
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_s16);
+}
+
+#undef QSYMM16_DATASET_ITER
+#undef QSYMM16_DATASET_2D
+#undef QSYMM16_DATASET_1D
+
+TEST_SUITE_END() // QSYMM16
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // QLSTMLayerNormalization
+TEST_SUITE_END() // CL
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/QLSTMLayerNormalization.cpp b/tests/validation/NEON/QLSTMLayerNormalization.cpp
index 8508a6e483..248bf5cf78 100644
--- a/tests/validation/NEON/QLSTMLayerNormalization.cpp
+++ b/tests/validation/NEON/QLSTMLayerNormalization.cpp
@@ -158,7 +158,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL,
 // *INDENT-ON*
 
 template <typename T>
-using NEQLSTMLayerNormalizationFixture = QLSTMLayerNormalizationValidationFixture<Tensor, Accessor, NEQLSTMLayerNormalizationKernel, T>;
+using NEQLSTMLayerNormalizationFixture = NEQLSTMLayerNormalizationValidationFixture<Tensor, Accessor, NEQLSTMLayerNormalizationKernel, T>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QSYMM16)
diff --git a/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h b/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
index 5d2cd2bd55..72af9d9241 100644
--- a/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
+++ b/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
@@ -26,6 +26,9 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#ifdef ARM_COMPUTE_CL
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#endif /* ARM_COMPUTE_CL */
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
@@ -98,6 +101,8 @@ protected:
         }
     }
 
+    virtual void run_target(FunctionType &fn) = 0;
+
     TensorType compute_target(const TensorShape &input_shape, const TensorShape &weight_shape, const TensorShape &bias_shape)
     {
         TensorType input = create_tensor<TensorType>(input_shape, _data_type, 1);
@@ -110,9 +115,7 @@ protected:
         allocate_tensors({ &input, &weight, &bias, &output });
         fill(AccessorType(input), AccessorType(weight), AccessorType(bias));
 
-        ThreadInfo tinfo;
-        tinfo.cpu_info = &NEScheduler::get().cpu_info();
-        fn.run(fn.window(), tinfo);
+        run_target(fn);
 
         return output;
     }
@@ -136,6 +139,31 @@ protected:
     QuantizationInfo _qinfo{};
 };
 
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class NEQLSTMLayerNormalizationValidationFixture : public QLSTMLayerNormalizationValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+protected:
+    void run_target(FunctionType &fn) override
+    {
+        ThreadInfo tinfo;
+        tinfo.cpu_info = &NEScheduler::get().cpu_info();
+        fn.run(fn.window(), tinfo);
+    }
+};
+
+#ifdef ARM_COMPUTE_CL
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class CLQLSTMLayerNormalizationValidationFixture : public QLSTMLayerNormalizationValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+protected:
+    void run_target(FunctionType &fn) override
+    {
+        CLScheduler::get().default_init();
+        fn.run(fn.window(), CLScheduler::get().queue());
+    }
+};
+#endif /* ARM_COMPUTE_CL */
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
--
cgit v1.2.1
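For illustration only (not part of the patch): a minimal sketch of how the new kernel could be driven from client code, assuming the standard ACL CL runtime setup. The shapes and quantization scales below are invented for the example.

    #include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        // Initialise the OpenCL scheduler (context, queue and kernel library)
        CLScheduler::get().default_init();

        // 2D QSYMM16 input (width x batches), 1D QSYMM16 weight and 1D S32 bias
        CLTensor input, weight, bias, output;
        input.allocator()->init(TensorInfo(TensorShape(15U, 2U), 1, DataType::QSYMM16, QuantizationInfo(1.f / 8192.f)));
        weight.allocator()->init(TensorInfo(TensorShape(15U), 1, DataType::QSYMM16, QuantizationInfo(1.f / 8192.f)));
        bias.allocator()->init(TensorInfo(TensorShape(15U), 1, DataType::S32));
        output.allocator()->init(TensorInfo(TensorShape(15U, 2U), 1, DataType::QSYMM16));

        // Configure before allocation; the kernel reads the weight's quantization info here
        CLQLSTMLayerNormalizationKernel norm;
        norm.configure(&input, &output, &weight, &bias);

        input.allocator()->allocate();
        weight.allocator()->allocate();
        bias.allocator()->allocate();
        output.allocator()->allocate();

        // ... map the tensors and fill input/weight/bias with quantized data ...

        CLScheduler::get().enqueue(norm);
        CLScheduler::get().sync();
        return 0;
    }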