diff options
Diffstat (limited to 'src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp')
-rw-r--r-- | src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp | 123 |
1 files changed, 68 insertions, 55 deletions
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp index a88b193b31..8e1ed3a2a5 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,17 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/NESymm.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" #include <map> @@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 const int64_t b_3 = vgetlane(b_high, 1); int64x2x2_t result; - const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 }; - const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 }; + const int64x2_t result_0{a_0 * b_0, a_1 * b_1}; + const int64x2_t result_1{a_2 * b_2, a_3 * b_3}; result.val[0] = vadd(vmovl(vgetlow(bias)), result_0); result.val[1] = vadd(vmovl(vgethigh(bias)), result_1); @@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 } } // namespace -void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias) +void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *weight, + const ITensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); ARM_COMPUTE_ERROR_ON(input == output); ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info())); - static const std::map<DataType, ComputeFuncType> fn_map = - { - { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) }, + static const std::map<DataType, ComputeFuncType> fn_map = { + {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)}, }; _input = input; @@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o _output->info()->set_quantization_info(compute_output_qinfo()); const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform(); - const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); + const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); _output_shift *= -1; - if(!bool(s)) + if (!bool(s)) { _output_multiplier = 0; _output_shift = 0; @@ -134,7 +136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target) return window; } -Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_UNUSED(output, bias, weight, input); @@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -182,11 +187,11 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16( using AccType = int64_t; using InputDataType = int16_t; - AccType sum{ 0 }; - AccType sum_sq{ 0 }; + AccType sum{0}; + AccType sum_sq{0}; int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { using namespace wrapper; const int16x8_t val = vloadq(input_ptr + x); @@ -200,6 +205,7 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16( sum_sq += static_cast<AccType>(vaddv(vmul(val_low, val_low))); sum_sq += static_cast<AccType>(vaddv(vmul(val_high, val_high))); #else // __aarch64__ + // only AArch64 supports vaddv const int64x2_t pair_sum_low = vpaddl(val_low); const int64x2_t pair_sum_high = vpaddl(val_high); @@ -215,7 +221,7 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16( #endif // __aarch64__ } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { const InputDataType val = input_ptr[x]; sum += static_cast<AccType>(val); @@ -229,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift) + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift) { using OutputDataType = int16_t; @@ -237,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{}); int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { const int16x8_t val = vloadq(input_ptr + x); int32x4x2_t shifted; @@ -266,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i vstore(output_ptr + x + 4, vqmovn(out_val.val[1])); } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { - const auto val = static_cast<int32_t>(input_ptr[x]); - const int32_t shifted = (val << 10) - mean; - const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); - const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; + const auto val = static_cast<int32_t>(input_ptr[x]); + const int32_t shifted = (val << 10) - mean; + const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); + const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; const auto reverse_shifted = static_cast<int32_t>((weighted + 512) >> 10); - int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); - out_val = utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min()); - output_ptr[x] = static_cast<OutputDataType>(out_val); + int32_t out_val = + quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); + out_val = + utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min()); + output_ptr[x] = static_cast<OutputDataType>(out_val); } } @@ -286,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16() using BiasDataType = int32_t; using AccType = int64_t; - Iterator input_iterator{ _input, _inout_window }; - Iterator output_iterator{ _output, _inout_window }; - Iterator weight_iterator{ _weight, _weight_window }; - Iterator bias_iterator{ _bias, _weight_window }; + Iterator input_iterator{_input, _inout_window}; + Iterator output_iterator{_output, _inout_window}; + Iterator weight_iterator{_weight, _weight_window}; + Iterator bias_iterator{_bias, _weight_window}; const auto weight_ptr = reinterpret_cast<const InputDataType *>(weight_iterator.ptr()); const auto bias_ptr = reinterpret_cast<const BiasDataType *>(bias_iterator.ptr()); const uint32_t column_size = _input->info()->tensor_shape()[0]; - execute_window_loop(_inout_window, [ &, this](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr()); - auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr()); - - AccType sum{ 0 }; - AccType sum_sq{ 0 }; - std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); - - AccType mean{ 0 }; - AccType variance{ 0 }; - std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); - - int32_t stddev_invsqrt_mul{}; - int32_t stddev_invsqrt_shift{}; - quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift); - - normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); - }, - input_iterator, output_iterator); + execute_window_loop( + _inout_window, + [&, this](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr()); + auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr()); + + AccType sum{0}; + AccType sum_sq{0}; + std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); + + AccType mean{0}; + AccType variance{0}; + std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); + + int32_t stddev_invsqrt_mul{}; + int32_t stddev_invsqrt_shift{}; + quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, + stddev_invsqrt_shift); + + normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); + }, + input_iterator, output_iterator); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute |