From a5d61bf5cd566955f3902e07c43c5c1c059bf8e9 Mon Sep 17 00:00:00 2001 From: Pablo Marquez Tello Date: Thu, 17 Mar 2022 12:52:02 +0000 Subject: NEQLSTM: Add support for QASYMM8_SIGNED for input_to_forget_weights * QLSTM only supports QSYMM8 for the argument input_to_forget_weights * We add support for QASYMM8_SIGNED by dequantizing and requantizing to QSYMM8 * Resolves COMPMID-5184 Change-Id: I1cae18d81dafdb7ae722b520a1354cf4a56b9606 Signed-off-by: Pablo Marquez Tello Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7321 Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins (cherry picked from commit 187a041dedf8e9db0c9e0652f13f8639dca880f3) --- arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 11 +- src/cpu/kernels/CpuQuantizeKernel.cpp | 44 ++++++- src/cpu/kernels/CpuQuantizeKernel.h | 3 + src/runtime/NEON/functions/NEQLSTMLayer.cpp | 134 ++++++++++++++++++---- 4 files changed, 168 insertions(+), 24 deletions(-) diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index acbd92cff7..185d821ec0 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,9 +29,11 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NECopy.h" +#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" +#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/common/LSTMParams.h" @@ -255,6 +257,9 @@ private: }; // Functions used + + NEDequantizationLayer _dequantize_input_to_forget_weights; + NEQuantizationLayer _quantize_input_to_forget_weights; NETranspose _transpose_input_to_forget_weights; NETranspose _transpose_input_to_cell_weights; NETranspose _transpose_input_to_output_weights; @@ -381,6 +386,9 @@ private: static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias); // Temporary tensors + Tensor _input_to_forget_weights_f32{ nullptr }; + Tensor _input_to_forget_weights_symm8{ nullptr }; + Tensor _input_to_forget_weights_transposed{ nullptr }; Tensor _input_to_cell_weights_transposed{ nullptr }; Tensor _input_to_output_weights_transposed{ nullptr }; @@ -449,6 +457,7 @@ private: bool _has_peephole{ false }; bool _has_layer_norm{ false }; bool _projection_tensor_copy_required{ false }; + bool _convert_input_to_forget_weights_to_qsymm8{ false }; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEQLSTMLAYER_H */ diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp index ecae5e7b4e..9700c62318 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.cpp +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
@@ -55,7 +55,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);

     return Status{};
@@ -123,6 +123,8 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
         { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
         { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },

+        { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t> },
+
         { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
         { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
         { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
@@ -157,6 +159,42 @@ Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds
     return Status{};
 }

+template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+    {
+        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+    }
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+        auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+        int  x          = window_start_x;
+        for(; x <= (window_end_x - window_step); x += window_step)
+        {
+            wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+        }
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
+        }
+    },
+    input, output);
+}
+
 template <typename TIn, typename TOut>
 void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
 {
@@ -263,4 +301,4 @@ const char *CpuQuantizeKernel::name() const
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index 28690bea54..2bc8105a11 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -81,6 +81,9 @@ private:
     template <typename T>
     void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window);

+    template <typename TIn, typename TOut>
+    void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window);
+
     QuantizeFunctionExecutorPtr _func{ nullptr };
 };
 } // namespace kernels
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 76bb8c01d2..c6e6a71cb7 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -111,17 +111,81 @@ void NEQLSTMLayer::TensorCopyKernel::run()

 NEQLSTMLayer::~NEQLSTMLayer() = default;

 NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(), _transpose_input_to_forget_weights(), _transpose_input_to_cell_weights(), _transpose_input_to_output_weights(), _transpose_input_to_input_weights(),
-      _transpose_recurrent_to_forget_weights(), _transpose_recurrent_to_cell_weights(), _transpose_recurrent_to_output_weights(), _transpose_recurrent_to_input_weights(), _transpose_projection_weights(),
-      _input_to_input_reduction(), _recurrent_to_input_reduction(), _input_to_forget_reduction(), _recurrent_to_forget_reduction(), _input_to_cell_reduction(), _recurrent_to_cell_reduction(),
-      _input_to_output_reduction(), _recurrent_to_output_reduction(), _projection_reduction(), _projection_bias_add(), _mm_input_to_forget(), _mm_recurrent_to_forget(), _pixelwise_mul_cell_to_forget(),
-      _input_to_forget_outstage(), _recurrent_to_forget_outstage(), _cell_to_forget_outstage(), _accumulate_input_recurrent_forget(), _accumulate_cell_forget(), _forget_gate_sigmoid(), _mm_input_to_cell(),
-      _input_to_cell_outstage(), _mm_recurrent_to_cell(), _recurrent_to_cell_outstage(), _accumulate_input_recurrent_modulation(), _cell_gate_tanh(), _input_gate_sub(), _mm_input_to_input(),
-      _input_to_input_outstage(), _mm_recurrent_to_input(), _recurrent_to_input_outstage(), _accumulate_input_recurrent_input(), _pixelwise_mul_cell_to_input(), _cell_to_input_outstage(),
-      _accumulate_cell_input(), _input_gate_sigmoid(), _pixelwise_mul_forget_cell(), _pixelwise_mul_input_cell(), _add_forget_cell(), _cell_clip(), _mm_input_to_output(), _input_to_output_outstage(),
-      _mm_recurrent_to_output(), _recurrent_to_output_outstage(), _accumulate_input_recurrent_output(), _pixelwise_mul_cell_to_output(), _cell_to_output_outstage(), _accumulate_cell_to_output(),
-      _output_gate_sigmoid(), _hidden_tanh(), _pixelwise_mul_hidden(), _hidden_outstage(), _mm_projection(), _projection_outstage(), _accumulate_projection(), _projection_clip(), _projection_bias_copy(),
-      _projection_output_to_accumulate_copy(), _projection_accumulate_to_output_copy(), _hidden_to_output_copy(), _layer_norms(), _copy_output(), _layer_norm_weights(), _layer_norm_bias(),
+    : _memory_group(),
+      _dequantize_input_to_forget_weights(),
+      _quantize_input_to_forget_weights(),
+      _transpose_input_to_forget_weights(),
+      _transpose_input_to_cell_weights(),
+      _transpose_input_to_output_weights(),
+      _transpose_input_to_input_weights(),
+      _transpose_recurrent_to_forget_weights(),
+      _transpose_recurrent_to_cell_weights(),
+      _transpose_recurrent_to_output_weights(),
+      _transpose_recurrent_to_input_weights(),
+      _transpose_projection_weights(),
+      _input_to_input_reduction(),
+      _recurrent_to_input_reduction(),
+      _input_to_forget_reduction(),
+      _recurrent_to_forget_reduction(),
+      _input_to_cell_reduction(),
+      _recurrent_to_cell_reduction(),
+      _input_to_output_reduction(),
+      _recurrent_to_output_reduction(),
+      _projection_reduction(),
+      _projection_bias_add(),
+      _mm_input_to_forget(),
+      _mm_recurrent_to_forget(),
+      _pixelwise_mul_cell_to_forget(),
+      _input_to_forget_outstage(),
+      _recurrent_to_forget_outstage(),
+      _cell_to_forget_outstage(),
+      _accumulate_input_recurrent_forget(),
+      _accumulate_cell_forget(),
+
_forget_gate_sigmoid(), + _mm_input_to_cell(), + _input_to_cell_outstage(), + _mm_recurrent_to_cell(), + _recurrent_to_cell_outstage(), + _accumulate_input_recurrent_modulation(), + _cell_gate_tanh(), + _input_gate_sub(), + _mm_input_to_input(), + _input_to_input_outstage(), + _mm_recurrent_to_input(), + _recurrent_to_input_outstage(), + _accumulate_input_recurrent_input(), + _pixelwise_mul_cell_to_input(), + _cell_to_input_outstage(), + _accumulate_cell_input(), + _input_gate_sigmoid(), + _pixelwise_mul_forget_cell(), + _pixelwise_mul_input_cell(), + _add_forget_cell(), + _cell_clip(), + _mm_input_to_output(), + _input_to_output_outstage(), + _mm_recurrent_to_output(), + _recurrent_to_output_outstage(), + _accumulate_input_recurrent_output(), + _pixelwise_mul_cell_to_output(), + _cell_to_output_outstage(), + _accumulate_cell_to_output(), + _output_gate_sigmoid(), + _hidden_tanh(), + _pixelwise_mul_hidden(), + _hidden_outstage(), + _mm_projection(), + _projection_outstage(), + _accumulate_projection(), + _projection_clip(), + _projection_bias_copy(), + _projection_output_to_accumulate_copy(), + _projection_accumulate_to_output_copy(), + _hidden_to_output_copy(), + _layer_norms(), + _copy_output(), + _layer_norm_weights(), + _layer_norm_bias(), _layer_norm_output() { _memory_group = MemoryGroup(std::move(memory_manager)); @@ -174,12 +238,37 @@ void NEQLSTMLayer::configure(const ITensor *input, _recurrent_to_cell_weights_transposed.info()->set_quantization_info(recurrent_to_cell_weights->info()->quantization_info()); _recurrent_to_output_weights_transposed.info()->set_quantization_info(recurrent_to_output_weights->info()->quantization_info()); - // Validate - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + if(input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + { + _convert_input_to_forget_weights_to_qsymm8 = true; + // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32 + + _input_to_forget_weights_f32.allocator()->init(TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) + .set_data_layout(input_to_forget_weights->info()->data_layout())); + // Setup the quantize output tensor to go from F32 -> QSYMM8 + _input_to_forget_weights_symm8.allocator()->init((TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) + .set_data_layout(input_to_forget_weights->info()->data_layout()) + .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); + + _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32); + _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8); + _input_to_forget_weights_f32.allocator()->allocate(); + _input_to_forget_weights_symm8.allocator()->allocate(); + + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + 
forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+                                                          cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
+                                                          lstm_params_info));
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+                                                          recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+                                                          forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+                                                          cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
+                                                          lstm_params_info));
+    }

     const int batch_size = input->info()->dimension(1);
     const int num_units  = input_to_output_weights->info()->dimension(1);
@@ -190,7 +279,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
     const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();

     _projection_bias             = lstm_params.projection_bias();
-    _input_to_forget_weights     = input_to_forget_weights;
+    _input_to_forget_weights     = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) ? &_input_to_forget_weights_symm8 : input_to_forget_weights;
     _input_to_cell_weights       = input_to_cell_weights;
     _input_to_output_weights     = input_to_output_weights;
     _recurrent_to_forget_weights = recurrent_to_forget_weights;
@@ -611,10 +700,9 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
                                                        recurrent_to_output_weights);
-
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);
@@ -967,6 +1055,12 @@ void NEQLSTMLayer::run()
     // Acquire all the temporaries
     MemoryGroupResourceScope scope_mg(_memory_group);

+    if(_convert_input_to_forget_weights_to_qsymm8)
+    {
+        _dequantize_input_to_forget_weights.run();
+        _quantize_input_to_forget_weights.run();
+    }
+
     // Forget gate.
     _mm_input_to_forget.run();
     _input_to_forget_outstage.run();
-- cgit v1.2.1
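
For context, a minimal, self-contained sketch (not part of the patch) of the same two-step conversion NEQLSTMLayer now performs internally on input_to_forget_weights, written against the public runtime API this change extends with an F32 -> QSYMM8 path. The tensor shape and quantization parameters below are illustrative assumptions, not values taken from the patch.

// Sketch: convert QASYMM8_SIGNED weights to QSYMM8 via an F32 intermediate,
// mirroring the dequantize/requantize pair set up in NEQLSTMLayer::configure().
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    const TensorShape weights_shape(32U, 16U); // illustrative input_size x num_units

    Tensor weights_s8;   // source, QASYMM8_SIGNED
    Tensor weights_f32;  // intermediate, F32
    Tensor weights_sym8; // destination, QSYMM8

    // Assumed asymmetric (scale, offset) quantization on the source weights
    weights_s8.allocator()->init(TensorInfo(weights_shape, 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.05f, 10)));
    weights_f32.allocator()->init(TensorInfo(weights_shape, 1, DataType::F32));
    // QSYMM8 is symmetric, so only a scale is carried here; the patch itself
    // copies the caller's quantization info onto the QSYMM8 tensor instead
    weights_sym8.allocator()->init(TensorInfo(weights_shape, 1, DataType::QSYMM8, QuantizationInfo(0.05f)));

    NEDequantizationLayer dequantize;
    NEQuantizationLayer   quantize;
    dequantize.configure(&weights_s8, &weights_f32);
    quantize.configure(&weights_f32, &weights_sym8); // exercises the new F32 -> QSYMM8 kernel path

    weights_s8.allocator()->allocate();
    weights_f32.allocator()->allocate();
    weights_sym8.allocator()->allocate();

    // ... fill weights_s8 with quantized weight values ...

    dequantize.run(); // QASYMM8_SIGNED -> F32
    quantize.run();   // F32 -> QSYMM8
    return 0;
}

In the layer itself, the same pair runs at the start of NEQLSTMLayer::run() whenever _convert_input_to_forget_weights_to_qsymm8 is set, so the converted QSYMM8 weights feed the forget-gate matrix multiply in place of the caller's QASYMM8_SIGNED tensor.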