From deb3ac461b2b8a4f85ff91b422b6e0ada3be1300 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Fri, 20 Dec 2019 10:02:17 +0000 Subject: COMPMID-2807: Add support for QASYMM8_SIGNED in NEGEMMMatrixVectorMultiplyKernel Change-Id: I8d33969dfc61c9a3793954cc12d22f24fb9117f0 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2513 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins --- .../kernels/NEGEMMMatrixVectorMultiplyKernel.h | 8 +- .../kernels/NEGEMMMatrixVectorMultiplyKernel.cpp | 104 ++++++++++++++++++--- 2 files changed, 94 insertions(+), 18 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h index 63b42aae26..f5635dd58c 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h @@ -50,17 +50,17 @@ public: NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32 + * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 * @param[in] input1 Second Input tensor. Data types supported: same as @p input. - * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input. + * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input. */ void configure(const ITensor *input0, const ITensor *input1, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel * - * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32 + * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 * @param[in] input1 Second Input tensor. Data types supported: same as @p input. - * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input. + * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input. * * @return a status */ diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp index 0e77ead72b..cf8411c55f 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp @@ -38,18 +38,23 @@ #include #include -using namespace arm_compute; - +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32)); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != input0->data_type())); + if(is_data_type_quantized_asymmetric(input0->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); + } ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1)); @@ -87,8 +92,6 @@ void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &wind ARM_COMPUTE_UNUSED(window_out); } -namespace arm_compute -{ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, @@ -242,7 +245,79 @@ void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply +void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, + const Window &window_w, + const Window &window_out) +{ + Iterator in(_input0, window_in); + Iterator in2(_input1, window_w); + Iterator out(_output, window_out); + + const int input_offset = -_input0->info()->quantization_info().uniform().offset; + const int weights_offset = -_input1->info()->quantization_info().uniform().offset; + + const int input_w = _input0->info()->dimension(0); + const int input_h = _input0->info()->dimension(1); + const int input_stride_x = _input0->info()->strides_in_bytes().x(); + const int weights_stride_x = _input1->info()->strides_in_bytes().x(); + const int weights_stride_y = _input1->info()->strides_in_bytes().y(); + const int output_stride_x = _output->info()->strides_in_bytes().x(); + const int read_step = 16 / _input0->info()->element_size(); + + const int32x4_t v_input_offset = vdupq_n_s32(input_offset); + const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); + + execute_window_loop(window_in, [&](const Coordinates & id) + { + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; + auto output_ptr = reinterpret_cast(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); + + int32x4_t row_dot = vdupq_n_s32(0); + for(int i = 0; i < input_w; i += read_step) + { + // Read values + const auto input = vld1q_s8(reinterpret_cast(input_ptr + i * input_stride_x)); + const auto weights = vld1q_s8(reinterpret_cast(weights_ptr + i * weights_stride_x)); + + // Add offsets + const int32x4x4_t input_s32 = + { + { + vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_low_s8(input)))), + vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_low_s8(input)))), + vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_high_s8(input)))), + vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_high_s8(input)))) + } + }; + const int32x4x4_t weights_s32 = + { + { + vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_low_s8(weights)))), + vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_low_s8(weights)))), + vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_high_s8(weights)))), + vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_high_s8(weights)))) + } + }; + + // Dot + row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0])); + row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1])); + row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2])); + row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3])); + } + + // Reduction + auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot)); + temp = vpadd_s32(temp, temp); + + *output_ptr = vget_lane_s32(temp, 0); + }, + in, in2, out); +} NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel() : _func(nullptr), _input0(nullptr), _input1(nullptr), _output(nullptr), _border_size(0) @@ -257,7 +332,6 @@ BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); _input0 = input0; @@ -270,6 +344,9 @@ void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT case DataType::QASYMM8: _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply; break; + case DataType::QASYMM8_SIGNED: + _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply; + break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply; @@ -306,6 +383,7 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); Window window_slice = window.first_slice_window_3D(); @@ -327,8 +405,6 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - if(_func != nullptr) - { - (this->*_func)(window_in, window_weights, window_out); - } + (this->*_func)(window_in, window_weights, window_out); } +} // namespace arm_compute -- cgit v1.2.1