From 6b77e917801b4e979796ea75c538eef740482089 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Fri, 17 Nov 2017 09:27:57 +0000 Subject: COMPMID-665 - NEON: Add QASYMM8 in place Activation layer - Added min and max arguments for QuantizeDownInt32ToUint8Scale in order to apply bounded relu - Added support for int32_t biases - Extended tests Change-Id: I015dae17faa7284766b5435ca33bcf593c1b2b69 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/96512 Reviewed-by: Georgios Pinitas Reviewed-by: Anthony Barbier Tested-by: Kaizen --- .../kernels/NEGEMMLowpOffsetContributionKernel.h | 7 + ...NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 39 +++- .../runtime/NEON/functions/NEGEMMLowpOutputStage.h | 15 +- ...GEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp | 225 +++++++++++++++------ .../NEON/functions/NEGEMMLowpOutputStage.cpp | 4 +- tests/validation/CPP/GEMMLowp.cpp | 54 ++++- tests/validation/CPP/GEMMLowp.h | 9 +- tests/validation/NEON/GEMMLowp.cpp | 66 +++++- tests/validation/fixtures/GEMMLowpFixture.h | 55 +++-- 9 files changed, 370 insertions(+), 104 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h index 04b84339b0..8c1bae9396 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h @@ -35,6 +35,13 @@ class ITensor; * This kernel takes a final int32 accumulator value (the output of @NEGEMMLowpMatrixMultiplyKernel), * and adds to it the offset contribution of matrix A and matrix B in-place. * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * */ class NEGEMMLowpOffsetContributionKernel : public INEKernel { diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h index 65f1042b9c..4ec0e9df93 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -36,7 +36,10 @@ class ITensor; * The following computations will be performed by the kernel: * * -# Add offset terms to final result - * -# Multiply each entry of result and round to nearest integer + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. * */ @@ -56,22 +59,44 @@ public: /** Initialise the kernel's input and output. * * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. * @param[out] output Output tensor. 
Data type supported: Data type supported: QASYMM8 * @param[in] result_offset Offset to be added to each element of the input matrix * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions */ - void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift); + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: - const ITensor *_input; - ITensor *_output; - int32_t _result_offset; - int32_t _result_mult_int; - int32_t _result_shift; + /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template + void run(const Window &window); + + /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::*)(const Window &window); + + QuantizeDownFunctionPtr _func; + const ITensor *_input; + const ITensor *_bias; + ITensor *_output; + int _result_offset; + int _result_mult_int; + int _result_shift; + int _min; + int _max; }; } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h index 8557ef42e1..a3db23aaee 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -43,14 +43,18 @@ class ITensor; * NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift * The final result is: * - * ((input[i][k] + result_offset) * result_mult_int + rounding) >> result_shift + * ((input[i][k] + result_offset) * result_mult_int) >> result_shift * - * where rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1)) + * In case the bias tensor is provided, the final result is: + * + * ((input[i][k] + result_offset) * result_mult_int + bias[k]) >> result_shift * * This function calls the following NEON kernels: * * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel * + * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions + * before the result is shifted right by result_shift */ class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunction { @@ -58,12 +62,17 @@ public: /** Initialise the kernel's inputs, output * * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. 
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 * @param[in] result_offset Offset to be added to each element of the input matrix * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions */ - void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift); + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); }; } #endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */ \ No newline at end of file diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp index aa3c280788..26aaa2a9d5 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp @@ -23,10 +23,12 @@ */ #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" @@ -36,26 +38,173 @@ using namespace arm_compute; +namespace +{ +inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int) +{ + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32); + + // Multiply by result_mult_int + in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int); + in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int); + in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int); + in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int); +} + +template +inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + 
vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to U8 + uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_u8 = vmaxq_u8(out_u8, min_u8); + out_u8 = vminq_u8(out_u8, max_u8); + } + + return out_u8; +} +} // namespace + namespace arm_compute { class Coordinates; } // namespace arm_compute +template +void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window) +{ + const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(-_result_shift); + const uint8x16_t min_u8 = vdupq_n_u8(static_cast(_min)); + const uint8x16_t max_u8 = vdupq_n_u8(static_cast(_max)); + + ARM_COMPUTE_UNUSED(min_u8); + ARM_COMPUTE_UNUSED(max_u8); + + Iterator in(_input, window); + Iterator out(_output, window); + + if(_bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step())); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias(_bias, win_biases); + execute_window_loop(window, [&](const Coordinates & id) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(in.ptr()) + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + 12) + } + }; + + const int32x4x4_t bias_s32 = + { + { + vld1q_s32(reinterpret_cast(bias.ptr()) + 0), + vld1q_s32(reinterpret_cast(bias.ptr()) + 4), + vld1q_s32(reinterpret_cast(bias.ptr()) + 8), + vld1q_s32(reinterpret_cast(bias.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _result_mult_int); + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_u8(out.ptr(), finalize_quantization(in_s32, result_shift_s32, min_u8, max_u8)); + }, + in, bias, out); + } + else + { + execute_window_loop(window, [&](const Coordinates & id) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast(in.ptr()) + 0), + vld1q_s32(reinterpret_cast(in.ptr()) + 4), + vld1q_s32(reinterpret_cast(in.ptr()) + 8), + vld1q_s32(reinterpret_cast(in.ptr()) + 12) + } + }; + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _result_mult_int); + + vst1q_u8(out.ptr(), finalize_quantization(in_s32, result_shift_s32, min_u8, max_u8)); + }, + in, out); + } +} + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel() - : _input(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0) + : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0), _min(0), _max(0) { } -void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift) +void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) { 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON(max > 255); + ARM_COMPUTE_ERROR_ON(min < 0 || min > max); + + if(bias != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0)); + } _input = input; + _bias = bias; _output = output; _result_offset = result_offset; _result_mult_int = result_mult_int; _result_shift = result_shift; + _min = min; + _max = max; constexpr unsigned int num_elems_processed_per_iteration = 16; @@ -69,9 +218,22 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *inp input_access, output_result_access); + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]); + + update_window_and_padding(win, + bias_access); + } + output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); INEKernel::configure(win); + + const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255)); + + // Check if we need to clamp the result using min and max + _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run; } void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info) @@ -80,62 +242,5 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, co ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(-_result_shift); - const int32x4_t zero_s32 = vdupq_n_s32(0); - - Iterator in(_input, window); - Iterator out(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32); - - // Multiply by c_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _result_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _result_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _result_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _result_mult_int); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); - - // Saturate negative values - in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); - in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); - in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); - in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - 
vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to U8 - const uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - - vst1q_u8(out.ptr(), out_u8); - }, - in, out); + (this->*_func)(window); } \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index d09827f908..66cdf58634 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -29,9 +29,9 @@ using namespace arm_compute; -void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift) +void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, result_offset, result_mult_int, result_shift); + k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max); _kernel = std::move(k); } \ No newline at end of file diff --git a/tests/validation/CPP/GEMMLowp.cpp b/tests/validation/CPP/GEMMLowp.cpp index 8670a22a66..bf002cf2b5 100644 --- a/tests/validation/CPP/GEMMLowp.cpp +++ b/tests/validation/CPP/GEMMLowp.cpp @@ -33,6 +33,36 @@ namespace validation { namespace reference { +namespace +{ +template +void quantize_down_int32_to_uint8_scale(const SimpleTensor *in, const SimpleTensor *bias, SimpleTensor *dst, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, + int32_t min, int32_t max) +{ + const int cols_in = in->shape().x(); + + for(int i = 0; i < in->num_elements(); ++i) + { + int32_t result = ((*in)[i] + result_offset) * result_mult_int; + + if(bias != nullptr) + { + result += (*bias)[i % cols_in]; + } + + result >>= result_shift; + + // Bounded ReLu + if(min != max) + { + result = std::max(min, std::min(max, result)); + } + + (*dst)[i] = static_cast(std::max(0, std::min(255, result))); + } +} +} // namespace + template SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset) { @@ -80,21 +110,31 @@ SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor } template -SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) +SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max) { SimpleTensor dst(in.shape(), DataType::QASYMM8); - for(int i = 0; i < in.num_elements(); ++i) - { - const int32_t result = ((in[i] + result_offset) * result_mult_int) >> result_shift; - dst[i] = static_cast(std::max(0, std::min(255, result))); - } + quantize_down_int32_to_uint8_scale(&in, nullptr, &dst, result_offset, result_mult_int, result_shift, min, max); + + return dst; +} + +template +SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, const SimpleTensor &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, + int32_t min, int32_t max) +{ + SimpleTensor dst(in.shape(), DataType::QASYMM8); + + quantize_down_int32_to_uint8_scale(&in, &bias, &dst, result_offset, 
result_mult_int, result_shift, min, max); return dst; } template SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset); -template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift); +template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, + int32_t max); +template SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &a, const SimpleTensor &b, int32_t result_offset, int32_t result_mult_int, + int32_t result_shift, int32_t min, int32_t max); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/CPP/GEMMLowp.h b/tests/validation/CPP/GEMMLowp.h index cbed2206e3..ee33d8e0c0 100644 --- a/tests/validation/CPP/GEMMLowp.h +++ b/tests/validation/CPP/GEMMLowp.h @@ -35,14 +35,17 @@ namespace validation { namespace reference { +SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b); + template SimpleTensor gemmlowp_matrix_multiply_core(const SimpleTensor &a, const SimpleTensor &b, int32_t a_offset, int32_t b_offset); template -SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift); - -SimpleTensor gemmlowp(const SimpleTensor &a, const SimpleTensor &b); +SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min = 0, int32_t max = 0); +template +SimpleTensor gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor &in, const SimpleTensor &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, + int32_t min = 0, int32_t max = 0); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index ba91ced443..078096a0dd 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -131,34 +131,55 @@ TEST_SUITE(OutputStage) TEST_SUITE(QuantizeDownInt32ToUint8Scale) -using NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture; +const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, + 3) + * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true }); + +const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, + 2) + * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true }); -const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -4, 4) * framework::dataset::make("result_mult_int", 1, 3) * framework::dataset::make("result_shift", 2, - 4); +using NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture; DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallShapes(), 
datasets::LargeShapes()), quantize_down_int32_to_uint8_scale_cases), - shape, result_offset, result_mult_int, result_shift) + shape, result_offset, result_mult_int, result_shift, min, max, add_bias) { + TensorShape shape_bias(shape[0]); + // Create tensors - Tensor in = create_tensor(shape, DataType::S32); - Tensor out = create_tensor(shape, DataType::QASYMM8); + Tensor in = create_tensor(shape, DataType::S32); + Tensor bias = create_tensor(shape_bias, DataType::S32); + Tensor out = create_tensor(shape, DataType::QASYMM8); ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage; - output_stage.configure(&in, &out, result_offset, result_mult_int, result_shift); + output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_offset, result_mult_int, result_shift, min, max); - // Validate valid region + // Validate valid region input and output const ValidRegion valid_region = shape_to_valid_region(shape); validate(in.info()->valid_region(), valid_region); validate(out.info()->valid_region(), valid_region); + // Validate valid region bias + if(add_bias) + { + const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias); + validate(bias.info()->valid_region(), valid_region_bias); + } + // Validate padding const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); validate(in.info()->padding(), padding); validate(out.info()->padding(), padding); + + if(add_bias) + { + validate(bias.info()->padding(), padding); + } } FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases)) @@ -173,8 +194,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, validate(Accessor(_target), _reference); } -TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale +TEST_SUITE(BoundedReLu) +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // BoundedReLu + +TEST_SUITE(AddBias) +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // AddBias + +TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale TEST_SUITE_END() // OutputStage TEST_SUITE_END() // GEMMLowp diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index f9b0dbd959..a99e9323c8 100644 --- 
a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -122,10 +122,10 @@ class GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture : public framework: { public: template - void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) + void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias) { - _target = compute_target(shape, result_offset, result_mult_int, result_shift); - _reference = compute_reference(shape, result_offset, result_mult_int, result_shift); + _target = compute_target(shape, result_offset, result_mult_int, result_shift, min, max, add_bias); + _reference = compute_reference(shape, result_offset, result_mult_int, result_shift, min, max, add_bias); } protected: @@ -136,43 +136,72 @@ protected: library->fill(tensor, distribution, i); } - TensorType compute_target(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) + TensorType compute_target(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias) { + TensorShape shape_bias(shape[0]); + // Create tensors TensorType a = create_tensor(shape, DataType::S32, 1); - TensorType b = create_tensor(shape, DataType::QASYMM8, 1); + TensorType b = create_tensor(shape_bias, DataType::S32, 1); + TensorType c = create_tensor(shape, DataType::QASYMM8, 1); // Create and configure function FunctionType output_stage; - output_stage.configure(&a, &b, result_offset, result_mult_int, result_shift); + output_stage.configure(&a, add_bias ? &b : nullptr, &c, result_offset, result_mult_int, result_shift, min, max); ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); // Allocate tensors a.allocator()->allocate(); - b.allocator()->allocate(); + c.allocator()->allocate(); ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS); - // Fill tensors + // Fill tensor fill(AccessorType(a), 0); + if(add_bias) + { + ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Allocate bias tensor + b.allocator()->allocate(); + + ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Fill tensor + fill(AccessorType(b), 1); + } + // Compute GEMM function output_stage.run(); - return b; + return c; } - SimpleTensor compute_reference(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift) + SimpleTensor compute_reference(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias) { // Create reference + TensorShape shape_bias(shape[0]); + SimpleTensor a{ shape, DataType::S32, 1 }; + SimpleTensor b{ shape_bias, DataType::S32, 1 }; // Fill reference fill(a, 0); - return reference::gemmlowp_quantize_down_int32_to_uint8_scale(a, result_offset, result_mult_int, result_shift); + if(add_bias) + { + // Fill bias + fill(b, 1); + + return reference::gemmlowp_quantize_down_int32_to_uint8_scale(a, b, result_offset, result_mult_int, result_shift, min, max); + } + else + { + return 
reference::gemmlowp_quantize_down_int32_to_uint8_scale(a, result_offset, result_mult_int, result_shift, min, max);
        }
    }

    TensorType _target{};
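The extended output stage introduced above can be driven as in the following minimal sketch. It assumes the usual Compute Library flow (initialise tensor info, configure, allocate, fill, run); the shapes, the quantize-down parameters and the [32, 192] clamp bounds are illustrative values only, not values taken from this change. Passing nullptr instead of &bias skips the bias addition, and leaving min and max at their defaults (0, 0) skips the bounded-ReLU clamp.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Illustrative sizes: 16 rows of GEMM output, 256 output feature maps (OFM).
    const TensorShape mm_shape(256U, 16U); // int32 accumulators produced by NEGEMMLowpMatrixMultiplyCore
    const TensorShape bias_shape(256U);    // 1D shared bias: one int32 value per output column

    Tensor mm_result{}, bias{}, output{};
    mm_result.allocator()->init(TensorInfo(mm_shape, 1, DataType::S32));
    bias.allocator()->init(TensorInfo(bias_shape, 1, DataType::S32));
    output.allocator()->init(TensorInfo(mm_shape, 1, DataType::QASYMM8));

    // Example quantize-down parameters:
    // out = ((acc + result_offset) * result_mult_int + bias) >> result_shift, then clamp and cast to QASYMM8.
    const int result_offset   = -100;
    const int result_mult_int = 2;
    const int result_shift    = 8;

    // The bias tensor and the min/max bounds are the new, optional arguments added by this patch:
    // pass nullptr to skip the bias, and keep min == max == 0 to skip the bounded ReLU.
    NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage{};
    output_stage.configure(&mm_result, &bias, &output,
                           result_offset, result_mult_int, result_shift,
                           /* min */ 32, /* max */ 192);

    mm_result.allocator()->allocate();
    bias.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill mm_result and bias with the int32 accumulators and biases, then:
    output_stage.run();

    return 0;
}

Because configure() only enables the clamping path when the requested bounds are meaningful (min != max and not the full [0, 255] range), the unbounded specialisation of the kernel's run() is selected up front and the clamp adds no per-element cost when it is not requested.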