From 6b77e917801b4e979796ea75c538eef740482089 Mon Sep 17 00:00:00 2001
From: Gian Marco
Date: Fri, 17 Nov 2017 09:27:57 +0000
Subject: COMPMID-665 - NEON: Add QASYMM8 in place Activation layer

- Added min and max arguments for QuantizeDownInt32ToUint8Scale in
  order to apply bounded relu
- Added support for int32_t biases
- Extended tests

Change-Id: I015dae17faa7284766b5435ca33bcf593c1b2b69
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/96512
Reviewed-by: Georgios Pinitas
Reviewed-by: Anthony Barbier
Tested-by: Kaizen
---
 ...GEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp | 225 +++++++++++++++------
 1 file changed, 165 insertions(+), 60 deletions(-)

(limited to 'src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp')

diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index aa3c280788..26aaa2a9d5 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -23,10 +23,12 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
@@ -36,26 +38,173 @@
 using namespace arm_compute;
 
+namespace
+{
+inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
+{
+    // Add the offset terms to GEMM's result
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
+
+    // Multiply by result_mult_int
+    in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
+    in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
+    in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
+    in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
+}
+
+template <bool is_bounded_relu>
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+{
+    const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+    // Shift final result (negative value shift right)
+    in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+    in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+    in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+    in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+    // Saturate negative values
+    in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+    in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+    in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+    in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to U8
+    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_u8 = vmaxq_u8(out_u8, min_u8);
+        out_u8 = vminq_u8(out_u8, max_u8);
+    }
+
+    return out_u8;
+}
+} // namespace
+
 namespace arm_compute
 {
 class Coordinates;
 } // namespace arm_compute
 
+template <bool is_bounded_relu>
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window)
+{
+    const int32x4_t  result_offset_s32 = vdupq_n_s32(_result_offset);
+    const int32x4_t  result_shift_s32  = vdupq_n_s32(-_result_shift);
+    const uint8x16_t min_u8            = vdupq_n_u8(static_cast<uint8_t>(_min));
+    const uint8x16_t max_u8            = vdupq_n_u8(static_cast<uint8_t>(_max));
+
+    ARM_COMPUTE_UNUSED(min_u8);
+    ARM_COMPUTE_UNUSED(max_u8);
+
+    Iterator in(_input, window);
+    Iterator out(_output, window);
+
+    if(_bias != nullptr)
+    {
+        Window win_biases;
+        win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+        win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        Iterator bias(_bias, win_biases);
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
+                }
+            };
+
+            const int32x4x4_t bias_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result and multiply by result_mult_int
+            scale_input(in_s32, result_offset_s32, _result_mult_int);
+
+            // Add the bias to GEMM's result
+            in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+            in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+            in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+            in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+            vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+        },
+        in, bias, out);
+    }
+    else
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result and multiply by result_mult_int
+            scale_input(in_s32, result_offset_s32, _result_mult_int);
+
+            vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+        },
+        in, out);
+    }
+}
+
 NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel()
-    : _input(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0)
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0), _min(0), _max(0)
 {
 }
 
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift)
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_ERROR_ON(max > 255);
+    ARM_COMPUTE_ERROR_ON(min < 0 || min > max);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0));
+    }
 
     _input            = input;
+    _bias             = bias;
     _output           = output;
     _result_offset    = result_offset;
     _result_mult_int  = result_mult_int;
     _result_shift     = result_shift;
+    _min              = min;
+    _max              = max;
 
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
@@ -69,9 +218,22 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *inp
                               input_access,
                               output_result_access);
 
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]);
+
+        update_window_and_padding(win,
+                                  bias_access);
+    }
+
     output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
 
     INEKernel::configure(win);
+
+    const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+
+    // Check if we need to clamp the result using min and max
+    _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
 }
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info)
@@ -80,62 +242,5 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, co
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset);
-    const int32x4_t result_shift_s32  = vdupq_n_s32(-_result_shift);
-    const int32x4_t zero_s32          = vdupq_n_s32(0);
-
-    Iterator in(_input, window);
-    Iterator out(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        int32x4x4_t in_s32 =
-        {
-            {
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
-            }
-        };
-
-        // Add the offset terms to GEMM's result
-        in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
-        in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
-        in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
-        in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
-
-        // Multiply by c_mult_int
-        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _result_mult_int);
-        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _result_mult_int);
-        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _result_mult_int);
-        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _result_mult_int);
-
-        // Shift final result (negative value shift right)
-        in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
-        in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
-        in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
-        in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
-
-        // Saturate negative values
-        in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
-        in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
-        in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
-        in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
-        // Convert S32 to S16
-        const int16x8x2_t in_s16 =
-        {
-            {
-                vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-                vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-            }
-        };
-
-        // Convert S16 to U8
-        const uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
-
-        vst1q_u8(out.ptr(), out_u8);
-    },
-    in, out);
+    (this->*_func)(window);
 }
\ No newline at end of file
--
cgit v1.2.1
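
Editor's note: for readers unfamiliar with this requantization scheme, the per-element arithmetic that the NEON intrinsics in the patch vectorise reduces to the scalar sketch below. The helper name is hypothetical (it is not part of the patch); the order of operations (offset, multiply, bias, shift, saturation, optional bounded-relu clamp) mirrors scale_input() and finalize_quantization<is_bounded_relu>(), and the bounded-relu predicate matches the one computed in configure().

    #include <algorithm>
    #include <cstdint>

    // Hypothetical scalar reference for one S32 accumulator -> one QASYMM8 value.
    // Assumes >> is an arithmetic shift on negative values, as vshlq_s32 with a
    // negative shift count provides.
    uint8_t quantize_down_scale_ref(int32_t in, int32_t bias, int32_t result_offset,
                                    int32_t result_mult_int, int32_t result_shift,
                                    int32_t min, int32_t max)
    {
        int32_t x = (in + result_offset) * result_mult_int; // scale_input()
        x += bias;                                          // bias add (zero when no bias tensor is given)
        x >>= result_shift;                                 // vshlq_s32 with -result_shift
        x = std::max<int32_t>(x, 0);                        // saturate negatives (vmaxq_s32 with zero)
        x = std::min<int32_t>(x, 255);                      // saturating narrows (vqmovn_s32 / vqmovun_s16)
        // Same predicate as configure(): a plain [0, 255] clamp needs no extra work
        const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
        if(is_bounded_relu)
        {
            x = std::max(x, min);                           // vmaxq_u8(out_u8, min_u8)
            x = std::min(x, max);                           // vminq_u8(out_u8, max_u8)
        }
        return static_cast<uint8_t>(x);
    }

Because configure() bakes this predicate into the _func member pointer, the run<false> specialisation compiles without the final clamp, so callers that only need plain [0, 255] saturation pay no per-element cost for the new min/max arguments.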