From deb3ac461b2b8a4f85ff91b422b6e0ada3be1300 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Fri, 20 Dec 2019 10:02:17 +0000
Subject: COMPMID-2807: Add support for QASYMM8_SIGNED in
 NEGEMMMatrixVectorMultiplyKernel

Change-Id: I8d33969dfc61c9a3793954cc12d22f24fb9117f0
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2513
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../kernels/NEGEMMMatrixVectorMultiplyKernel.h     |   8 +-
 .../kernels/NEGEMMMatrixVectorMultiplyKernel.cpp   | 104 ++++++++++++++++++---
 2 files changed, 94 insertions(+), 18 deletions(-)
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
index 63b42aae26..f5635dd58c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
@@ -50,17 +50,17 @@ public:
     NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input0 First Input tensor. Data types supported: QASYMM8/F16/F32
+     * @param[in]  input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
      * @param[in]  input1 Second Input tensor. Data types supported: same as @p input.
-     * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input.
+     * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input.
      */
     void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel
      *
-     * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32
+     * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
      * @param[in] input1 Second Input tensor. Data types supported: same as @p input.
-     * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input.
+     * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input.
      *
      * @return a status
      */
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index 0e77ead72b..cf8411c55f 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -38,18 +38,23 @@
 #include <cstdint>
 #include <tuple>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != input0->data_type()));
+    if(is_data_type_quantized_asymmetric(input0->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+    }
 
     ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
@@ -87,8 +92,6 @@ void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &wind
     ARM_COMPUTE_UNUSED(window_out);
 }
 
-namespace arm_compute
-{
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <>
 void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>(const Window &window_in,
@@ -242,7 +245,79 @@ void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t,
     },
     in, in2, out);
 }
-} //namespace arm_compute
+
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<int8_t, int8_t, int32_t>(const Window &window_in,
+                                                                                       const Window &window_w,
+                                                                                       const Window &window_out)
+{
+    Iterator in(_input0, window_in);
+    Iterator in2(_input1, window_w);
+    Iterator out(_output, window_out);
+
+    const int input_offset   = -_input0->info()->quantization_info().uniform().offset;
+    const int weights_offset = -_input1->info()->quantization_info().uniform().offset;
+
+    const int input_w          = _input0->info()->dimension(0);
+    const int input_h          = _input0->info()->dimension(1);
+    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
+    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+    const int output_stride_x  = _output->info()->strides_in_bytes().x();
+    const int read_step        = 16 / _input0->info()->element_size();
+
+    const int32x4_t v_input_offset   = vdupq_n_s32(input_offset);
+    const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
+
+    execute_window_loop(window_in, [&](const Coordinates & id)
+    {
+        // Get pointers
+        const uint8_t *const input_ptr   = in.ptr();
+        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
+        auto                 output_ptr  = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
+        int32x4_t row_dot = vdupq_n_s32(0);
+        for(int i = 0; i < input_w; i += read_step)
+        {
+            // Read values
+            const auto input   = vld1q_s8(reinterpret_cast<const int8_t *>(input_ptr + i * input_stride_x));
+            const auto weights = vld1q_s8(reinterpret_cast<const int8_t *>(weights_ptr + i * weights_stride_x));
+
+            // Add offsets
+            const int32x4x4_t input_s32 =
+            {
+                {
+                    vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_low_s8(input)))),
+                    vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_low_s8(input)))),
+                    vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_high_s8(input)))),
+                    vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_high_s8(input))))
+                }
+            };
+            const int32x4x4_t weights_s32 =
+            {
+                {
+                    vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_low_s8(weights)))),
+                    vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_low_s8(weights)))),
+                    vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_high_s8(weights)))),
+                    vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_high_s8(weights))))
+                }
+            };
+
+            // Dot
+            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0]));
+            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1]));
+            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2]));
+            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3]));
+        }
+
+        // Reduction
+        auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot));
+        temp      = vpadd_s32(temp, temp);
+
+        *output_ptr = vget_lane_s32(temp, 0);
+    },
+    in, in2, out);
+}
 
 NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel()
     : _func(nullptr), _input0(nullptr), _input1(nullptr), _output(nullptr), _border_size(0)
@@ -257,7 +332,6 @@ BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const
 void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
 
     _input0 = input0;
@@ -270,6 +344,9 @@ void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT
         case DataType::QASYMM8:
             _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>;
             break;
+        case DataType::QASYMM8_SIGNED:
+            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<int8_t, int8_t, int32_t>;
+            break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
             _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>;
@@ -306,6 +383,7 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
     Window window_slice = window.first_slice_window_3D();
 
@@ -327,8 +405,6 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf
     window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
     window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
-    if(_func != nullptr)
-    {
-        (this->*_func)(window_in, window_weights, window_out);
-    }
+    (this->*_func)(window_in, window_weights, window_out);
 }
+} // namespace arm_compute
-- 
cgit v1.2.1