about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPablo Marquez Tello <pablo.tello@arm.com>2022-03-17 12:52:02 +0000
committerramelg01 <ramy.elgammal@arm.com>2022-03-29 10:26:17 +0100
commita5d61bf5cd566955f3902e07c43c5c1c059bf8e9 (patch)
tree7a5e4853d6998c207ca4f333df707d64068bb790
parentb83e67238bf84c5780f9d27c87cf30342099b291 (diff)
downloadComputeLibrary-a5d61bf5cd566955f3902e07c43c5c1c059bf8e9.tar.gz
NEQLSTM: Add support for QASYMM8_SIGNED for input_to_forget_weights
* QLSTM only supports QSYMM8 for the argument input_to_forget_weights
* We add support for QASYMM8_SIGNED by dequantizing and requantizing to QSYMM8
* Resolves COMPMID-5184

Change-Id: I1cae18d81dafdb7ae722b520a1354cf4a56b9606
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7321
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
(cherry picked from commit 187a041dedf8e9db0c9e0652f13f8639dca880f3)
-rw-r--r--  arm_compute/runtime/NEON/functions/NEQLSTMLayer.h  | 11
-rw-r--r--  src/cpu/kernels/CpuQuantizeKernel.cpp              | 44
-rw-r--r--  src/cpu/kernels/CpuQuantizeKernel.h                | 3
-rw-r--r--  src/runtime/NEON/functions/NEQLSTMLayer.cpp        | 134
4 files changed, 168 insertions(+), 24 deletions(-)
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index acbd92cff7..185d821ec0 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,9 +29,11 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/runtime/NEON/functions/NECopy.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/runtime/common/LSTMParams.h"
@@ -255,6 +257,9 @@ private:
};
// Functions used
+
+ NEDequantizationLayer _dequantize_input_to_forget_weights;
+ NEQuantizationLayer _quantize_input_to_forget_weights;
NETranspose _transpose_input_to_forget_weights;
NETranspose _transpose_input_to_cell_weights;
NETranspose _transpose_input_to_output_weights;
@@ -381,6 +386,9 @@ private:
static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
// Temporary tensors
+ Tensor _input_to_forget_weights_f32{ nullptr };
+ Tensor _input_to_forget_weights_symm8{ nullptr };
+
Tensor _input_to_forget_weights_transposed{ nullptr };
Tensor _input_to_cell_weights_transposed{ nullptr };
Tensor _input_to_output_weights_transposed{ nullptr };
@@ -449,6 +457,7 @@ private:
bool _has_peephole{ false };
bool _has_layer_norm{ false };
bool _projection_tensor_copy_required{ false };
+ bool _convert_input_to_forget_weights_to_qsymm8{ false };
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEQLSTMLAYER_H */
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index ecae5e7b4e..9700c62318 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
return Status{};
@@ -123,6 +123,8 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{ "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
{ "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },
+ { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t> },
+
{ "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
{ "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
{ "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
@@ -158,6 +160,42 @@ Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds
}
template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+ {
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+ }
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+ }
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
+ }
+ },
+ input, output);
+}
+
+template <typename TIn, typename TOut>
void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
@@ -263,4 +301,4 @@ const char *CpuQuantizeKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index 28690bea54..2bc8105a11 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -81,6 +81,9 @@ private:
template <typename T>
void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window);
+ template <typename TIn, typename TOut>
+ void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window);
+
QuantizeFunctionExecutorPtr _func{ nullptr };
};
} // namespace kernels
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 76bb8c01d2..c6e6a71cb7 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -111,17 +111,81 @@ void NEQLSTMLayer::TensorCopyKernel::run()
NEQLSTMLayer::~NEQLSTMLayer() = default;
NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(), _transpose_input_to_forget_weights(), _transpose_input_to_cell_weights(), _transpose_input_to_output_weights(), _transpose_input_to_input_weights(),
- _transpose_recurrent_to_forget_weights(), _transpose_recurrent_to_cell_weights(), _transpose_recurrent_to_output_weights(), _transpose_recurrent_to_input_weights(), _transpose_projection_weights(),
- _input_to_input_reduction(), _recurrent_to_input_reduction(), _input_to_forget_reduction(), _recurrent_to_forget_reduction(), _input_to_cell_reduction(), _recurrent_to_cell_reduction(),
- _input_to_output_reduction(), _recurrent_to_output_reduction(), _projection_reduction(), _projection_bias_add(), _mm_input_to_forget(), _mm_recurrent_to_forget(), _pixelwise_mul_cell_to_forget(),
- _input_to_forget_outstage(), _recurrent_to_forget_outstage(), _cell_to_forget_outstage(), _accumulate_input_recurrent_forget(), _accumulate_cell_forget(), _forget_gate_sigmoid(), _mm_input_to_cell(),
- _input_to_cell_outstage(), _mm_recurrent_to_cell(), _recurrent_to_cell_outstage(), _accumulate_input_recurrent_modulation(), _cell_gate_tanh(), _input_gate_sub(), _mm_input_to_input(),
- _input_to_input_outstage(), _mm_recurrent_to_input(), _recurrent_to_input_outstage(), _accumulate_input_recurrent_input(), _pixelwise_mul_cell_to_input(), _cell_to_input_outstage(),
- _accumulate_cell_input(), _input_gate_sigmoid(), _pixelwise_mul_forget_cell(), _pixelwise_mul_input_cell(), _add_forget_cell(), _cell_clip(), _mm_input_to_output(), _input_to_output_outstage(),
- _mm_recurrent_to_output(), _recurrent_to_output_outstage(), _accumulate_input_recurrent_output(), _pixelwise_mul_cell_to_output(), _cell_to_output_outstage(), _accumulate_cell_to_output(),
- _output_gate_sigmoid(), _hidden_tanh(), _pixelwise_mul_hidden(), _hidden_outstage(), _mm_projection(), _projection_outstage(), _accumulate_projection(), _projection_clip(), _projection_bias_copy(),
- _projection_output_to_accumulate_copy(), _projection_accumulate_to_output_copy(), _hidden_to_output_copy(), _layer_norms(), _copy_output(), _layer_norm_weights(), _layer_norm_bias(),
+ : _memory_group(),
+ _dequantize_input_to_forget_weights(),
+ _quantize_input_to_forget_weights(),
+ _transpose_input_to_forget_weights(),
+ _transpose_input_to_cell_weights(),
+ _transpose_input_to_output_weights(),
+ _transpose_input_to_input_weights(),
+ _transpose_recurrent_to_forget_weights(),
+ _transpose_recurrent_to_cell_weights(),
+ _transpose_recurrent_to_output_weights(),
+ _transpose_recurrent_to_input_weights(),
+ _transpose_projection_weights(),
+ _input_to_input_reduction(),
+ _recurrent_to_input_reduction(),
+ _input_to_forget_reduction(),
+ _recurrent_to_forget_reduction(),
+ _input_to_cell_reduction(),
+ _recurrent_to_cell_reduction(),
+ _input_to_output_reduction(),
+ _recurrent_to_output_reduction(),
+ _projection_reduction(),
+ _projection_bias_add(),
+ _mm_input_to_forget(),
+ _mm_recurrent_to_forget(),
+ _pixelwise_mul_cell_to_forget(),
+ _input_to_forget_outstage(),
+ _recurrent_to_forget_outstage(),
+ _cell_to_forget_outstage(),
+ _accumulate_input_recurrent_forget(),
+ _accumulate_cell_forget(),
+ _forget_gate_sigmoid(),
+ _mm_input_to_cell(),
+ _input_to_cell_outstage(),
+ _mm_recurrent_to_cell(),
+ _recurrent_to_cell_outstage(),
+ _accumulate_input_recurrent_modulation(),
+ _cell_gate_tanh(),
+ _input_gate_sub(),
+ _mm_input_to_input(),
+ _input_to_input_outstage(),
+ _mm_recurrent_to_input(),
+ _recurrent_to_input_outstage(),
+ _accumulate_input_recurrent_input(),
+ _pixelwise_mul_cell_to_input(),
+ _cell_to_input_outstage(),
+ _accumulate_cell_input(),
+ _input_gate_sigmoid(),
+ _pixelwise_mul_forget_cell(),
+ _pixelwise_mul_input_cell(),
+ _add_forget_cell(),
+ _cell_clip(),
+ _mm_input_to_output(),
+ _input_to_output_outstage(),
+ _mm_recurrent_to_output(),
+ _recurrent_to_output_outstage(),
+ _accumulate_input_recurrent_output(),
+ _pixelwise_mul_cell_to_output(),
+ _cell_to_output_outstage(),
+ _accumulate_cell_to_output(),
+ _output_gate_sigmoid(),
+ _hidden_tanh(),
+ _pixelwise_mul_hidden(),
+ _hidden_outstage(),
+ _mm_projection(),
+ _projection_outstage(),
+ _accumulate_projection(),
+ _projection_clip(),
+ _projection_bias_copy(),
+ _projection_output_to_accumulate_copy(),
+ _projection_accumulate_to_output_copy(),
+ _hidden_to_output_copy(),
+ _layer_norms(),
+ _copy_output(),
+ _layer_norm_weights(),
+ _layer_norm_bias(),
_layer_norm_output()
{
_memory_group = MemoryGroup(std::move(memory_manager));
@@ -174,12 +238,37 @@ void NEQLSTMLayer::configure(const ITensor *input,
_recurrent_to_cell_weights_transposed.info()->set_quantization_info(recurrent_to_cell_weights->info()->quantization_info());
_recurrent_to_output_weights_transposed.info()->set_quantization_info(recurrent_to_output_weights->info()->quantization_info());
- // Validate
- ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
- lstm_params_info));
+ if(input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+ {
+ _convert_input_to_forget_weights_to_qsymm8 = true;
+ // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32
+
+ _input_to_forget_weights_f32.allocator()->init(TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32)
+ .set_data_layout(input_to_forget_weights->info()->data_layout()));
+ // Setup the quantize output tensor to go from F32 -> QSYMM8
+ _input_to_forget_weights_symm8.allocator()->init((TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8)
+ .set_data_layout(input_to_forget_weights->info()->data_layout())
+ .set_quantization_info(input_to_forget_weights->info()->quantization_info())));
+
+ _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32);
+ _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8);
+ _input_to_forget_weights_f32.allocator()->allocate();
+ _input_to_forget_weights_symm8.allocator()->allocate();
+
+ ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
+ lstm_params_info));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
+ lstm_params_info));
+ }
const int batch_size = input->info()->dimension(1);
const int num_units = input_to_output_weights->info()->dimension(1);
@@ -190,7 +279,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();
_projection_bias = lstm_params.projection_bias();
- _input_to_forget_weights = input_to_forget_weights;
+ _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) ? &_input_to_forget_weights_symm8 : input_to_forget_weights;
_input_to_cell_weights = input_to_cell_weights;
_input_to_output_weights = input_to_output_weights;
_recurrent_to_forget_weights = recurrent_to_forget_weights;
@@ -611,10 +700,9 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8,DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
-
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);
@@ -967,6 +1055,12 @@ void NEQLSTMLayer::run()
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_memory_group);
+ if(_convert_input_to_forget_weights_to_qsymm8)
+ {
+ _dequantize_input_to_forget_weights.run();
+ _quantize_input_to_forget_weights.run();
+ }
+
// Forget gate.
_mm_input_to_forget.run();
_input_to_forget_outstage.run();