author     Gian Marco <gianmarco.iodice@arm.com>  2017-11-17 09:27:57 +0000
committer  Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:35:24 +0000
commit     6b77e917801b4e979796ea75c538eef740482089 (patch)
tree       0e693ecb1eb0b05018901a992b56781a08b9c266
parent     b3c81cb4100b3a449db5232364e18e649b26df58 (diff)
download   ComputeLibrary-6b77e917801b4e979796ea75c538eef740482089.tar.gz
COMPMID-665 - NEON: Add QASYMM8 in place Activation layer
- Added min and max arguments for QuantizeDownInt32ToUint8Scale in order to apply bounded relu
- Added support for int32_t biases
- Extended tests

Change-Id: I015dae17faa7284766b5435ca33bcf593c1b2b69
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/96512
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h7
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h39
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h15
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp225
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp4
-rw-r--r--tests/validation/CPP/GEMMLowp.cpp54
-rw-r--r--tests/validation/CPP/GEMMLowp.h9
-rw-r--r--tests/validation/NEON/GEMMLowp.cpp66
-rw-r--r--tests/validation/fixtures/GEMMLowpFixture.h55
9 files changed, 370 insertions, 104 deletions
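For reference, a minimal sketch (illustrative only, not part of the patch) of how the extended output stage introduced by this change could be invoked, adding a shared bias and bounding the result via min/max; the function name, tensor shapes and parameter values below are hypothetical:

// Hypothetical usage sketch of the new configure() signature.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"

void quantize_down_example()
{
    using namespace arm_compute;

    Tensor gemm_result; // S32 output of the GEMMLowp core
    Tensor bias;        // 1D S32 tensor with shape [OFM] (shared biases only)
    Tensor output;      // QASYMM8 destination, same shape as gemm_result

    gemm_result.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::S32));
    bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8));

    NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
    // min/max implement a bounded ReLU on the quantized result; passing 0, 0 skips the clamp.
    output_stage.configure(&gemm_result, &bias, &output, /*result_offset=*/-10,
                           /*result_mult_int=*/2, /*result_shift=*/4, /*min=*/32, /*max=*/192);

    gemm_result.allocator()->allocate();
    bias.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill gemm_result and bias with the int32 accumulators and biases ...
    output_stage.run();
}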
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
index 04b84339b0..8c1bae9396 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -35,6 +35,13 @@ class ITensor;
* This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
* and adds to it the offset contribution of matrix A and matrix B in-place.
*
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (vector_sum_col[k] * a_offset) +
+ * (vector_sum_row[i] * b_offset) +
+ * (a_offset * b_offset * k)
+ *
*/
class NEGEMMLowpOffsetContributionKernel : public INEKernel
{
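In scalar form, the formula in the comment above amounts to the following reference loop; this is an illustrative sketch (hypothetical helper, not part of the library), where depth stands for the number of columns of matrix A (written as k at the end of the documented formula):

// Scalar reference sketch of the in-place offset contribution documented above.
#include <cstdint>

void offset_contribution_reference(int32_t *mm_result, const int32_t *vector_sum_col,
                                   const int32_t *vector_sum_row, int rows, int cols,
                                   int depth, int32_t a_offset, int32_t b_offset)
{
    for(int i = 0; i < rows; ++i)     // rows of the GEMM result
    {
        for(int k = 0; k < cols; ++k) // columns of the GEMM result
        {
            mm_result[i * cols + k] += vector_sum_col[k] * a_offset
                                     + vector_sum_row[i] * b_offset
                                     + a_offset * b_offset * depth;
        }
    }
}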
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
index 65f1042b9c..4ec0e9df93 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -36,7 +36,10 @@ class ITensor;
* The following computations will be performed by the kernel:
*
* -# Add offset terms to final result
- * -# Multiply each entry of result and round to nearest integer
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds
* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
*
*/
@@ -56,22 +59,44 @@ public:
/** Initialise the kernel's input and output.
*
* @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be a nullptr if the bias addition is not required.
+ * The bias is a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
* @param[out] output Output tensor. Data type supported: QASYMM8
* @param[in] result_offset Offset to be added to each element of the input matrix
* @param[in] result_mult_int Value to be multiplied to each element of the input matrix once the result_offset has been added
* @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
*/
- void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift);
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- const ITensor *_input;
- ITensor *_output;
- int32_t _result_offset;
- int32_t _result_mult_int;
- int32_t _result_shift;
+ /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <bool is_bounded_relu>
+ void run(const Window &window);
+
+ /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::*)(const Window &window);
+
+ QuantizeDownFunctionPtr _func;
+ const ITensor *_input;
+ const ITensor *_bias;
+ ITensor *_output;
+ int _result_offset;
+ int _result_mult_int;
+ int _result_shift;
+ int _min;
+ int _max;
};
} // namespace arm_compute
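Taken together, the steps listed in the class comment amount to the following per-element computation; this scalar sketch mirrors the reference implementation added in tests/validation/CPP/GEMMLowp.cpp further down in this patch (the helper name is hypothetical):

// Scalar sketch of the quantize-down output stage for a single accumulator value.
#include <algorithm>
#include <cstdint>

uint8_t quantize_down_scale(int32_t input, int32_t bias, bool has_bias,
                            int32_t result_offset, int32_t result_mult_int,
                            int32_t result_shift, int32_t min, int32_t max)
{
    int32_t result = (input + result_offset) * result_mult_int;
    if(has_bias)
    {
        result += bias; // shared bias, indexed by output column in the real kernel
    }
    result >>= result_shift;

    // Bounded ReLU: only applied when min != max
    if(min != max)
    {
        result = std::max(min, std::min(max, result));
    }

    // Saturate to the QASYMM8 range and cast
    return static_cast<uint8_t>(std::max(0, std::min(255, result)));
}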
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index 8557ef42e1..a3db23aaee 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -43,14 +43,18 @@ class ITensor;
* NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
* The final result is:
*
- * ((input[i][k] + result_offset) * result_mult_int + rounding) >> result_shift
+ * ((input[i][k] + result_offset) * result_mult_int) >> result_shift
*
- * where rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1))
+ * In case the bias tensor is provided, the final result is:
+ *
+ * ((input[i][k] + result_offset) * result_mult_int + bias[k]) >> result_shift
*
* This function calls the following NEON kernels:
*
* -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
*
+ * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions,
+ * applied after the result is shifted right by result_shift
*/
class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunction
{
@@ -58,12 +62,17 @@ public:
/** Initialise the kernel's inputs, output
*
* @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be a nullptr if the bias addition is not required.
+ * The bias is a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
* @param[out] output Output tensor. Data type supported: QASYMM8
* @param[in] result_offset Offset to be added to each element of the input matrix
* @param[in] result_mult_int Value to be multiplied to each element of the input matrix once the result_offset has been added
* @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
*/
- void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift);
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
};
}
#endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */
\ No newline at end of file
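A small worked example of the formula above, using arbitrarily chosen illustrative values:

// Worked example (illustrative values): input = 150, result_offset = 10,
// result_mult_int = 3, bias = 32, result_shift = 2
// ((150 + 10) * 3 + 32) >> 2  ==  (480 + 32) >> 2  ==  512 >> 2  ==  128
static_assert(((150 + 10) * 3 + 32) >> 2 == 128, "output stage arithmetic");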
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index aa3c280788..26aaa2a9d5 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -23,10 +23,12 @@
*/
#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
@@ -36,26 +38,173 @@
using namespace arm_compute;
+namespace
+{
+inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
+{
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
+
+ // Multiply by result_mult_int
+ in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
+ in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
+ in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
+ in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
+}
+
+template <bool is_bounded_relu>
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to U8
+ uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_u8 = vmaxq_u8(out_u8, min_u8);
+ out_u8 = vminq_u8(out_u8, max_u8);
+ }
+
+ return out_u8;
+}
+} // namespace
+
namespace arm_compute
{
class Coordinates;
} // namespace arm_compute
+template <bool is_bounded_relu>
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window)
+{
+ const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset);
+ const int32x4_t result_shift_s32 = vdupq_n_s32(-_result_shift);
+ const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min));
+ const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(_max));
+
+ ARM_COMPUTE_UNUSED(min_u8);
+ ARM_COMPUTE_UNUSED(max_u8);
+
+ Iterator in(_input, window);
+ Iterator out(_output, window);
+
+ if(_bias != nullptr)
+ {
+ Window win_biases;
+ win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator bias(_bias, win_biases);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ int32x4x4_t in_s32 =
+ {
+ {
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
+ }
+ };
+
+ const int32x4x4_t bias_s32 =
+ {
+ {
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 12)
+ }
+ };
+
+ // Add the offset terms to GEMM's result and multiply by result_mult_int
+ scale_input(in_s32, result_offset_s32, _result_mult_int);
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+ },
+ in, bias, out);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ int32x4x4_t in_s32 =
+ {
+ {
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
+ }
+ };
+
+ // Add the offset terms to GEMM's result and multiply by result_mult_int
+ scale_input(in_s32, result_offset_s32, _result_mult_int);
+
+ vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+ },
+ in, out);
+ }
+}
+
NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel()
- : _input(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0)
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0), _min(0), _max(0)
{
}
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift)
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON(max > 255);
+ ARM_COMPUTE_ERROR_ON(min < 0 || min > max);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0));
+ }
_input = input;
+ _bias = bias;
_output = output;
_result_offset = result_offset;
_result_mult_int = result_mult_int;
_result_shift = result_shift;
+ _min = min;
+ _max = max;
constexpr unsigned int num_elems_processed_per_iteration = 16;
@@ -69,9 +218,22 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *inp
input_access,
output_result_access);
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]);
+
+ update_window_and_padding(win,
+ bias_access);
+ }
+
output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
INEKernel::configure(win);
+
+ const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+
+ // Check if we need to clamp the result using min and max
+ _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
}
void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info)
@@ -80,62 +242,5 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, co
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset);
- const int32x4_t result_shift_s32 = vdupq_n_s32(-_result_shift);
- const int32x4_t zero_s32 = vdupq_n_s32(0);
-
- Iterator in(_input, window);
- Iterator out(_output, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- int32x4x4_t in_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
-
- // Multiply by c_mult_int
- in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _result_mult_int);
- in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _result_mult_int);
- in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _result_mult_int);
- in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _result_mult_int);
-
- // Shift final result (negative value shift right)
- in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
- in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
- in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
- in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
-
- // Saturate negative values
- in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
- in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
- in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
- in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to U8
- const uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
-
- vst1q_u8(out.ptr(), out_u8);
- },
- in, out);
+ (this->*_func)(window);
}
\ No newline at end of file
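The kernel now selects between the bounded-ReLU and plain code paths once, at configure time, through a member-function pointer to a templated run method; a stripped-down sketch of that dispatch pattern (simplified stand-in class, not the actual kernel) is:

// Minimal sketch of configure-time dispatch to a templated worker: the
// is_bounded_relu decision is taken once, so the per-element loop never branches on it.
class QuantizeDownDispatchSketch
{
public:
    void configure(int min, int max)
    {
        // Clamping is a no-op when min == max or when the bounds already cover [0, 255]
        const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
        _func = is_bounded_relu ? &QuantizeDownDispatchSketch::run_impl<true>
                                : &QuantizeDownDispatchSketch::run_impl<false>;
    }

    void run()
    {
        (this->*_func)();
    }

private:
    using QuantizeDownFunctionPtr = void (QuantizeDownDispatchSketch::*)();

    template <bool is_bounded_relu>
    void run_impl()
    {
        // is_bounded_relu is a compile-time constant here, so the compiler can
        // drop the min/max clamp entirely in the unbounded instantiation.
    }

    QuantizeDownFunctionPtr _func{ nullptr };
};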
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index d09827f908..66cdf58634 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -29,9 +29,9 @@
using namespace arm_compute;
-void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift)
+void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
- k->configure(input, output, result_offset, result_mult_int, result_shift);
+ k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
_kernel = std::move(k);
}
\ No newline at end of file
diff --git a/tests/validation/CPP/GEMMLowp.cpp b/tests/validation/CPP/GEMMLowp.cpp
index 8670a22a66..bf002cf2b5 100644
--- a/tests/validation/CPP/GEMMLowp.cpp
+++ b/tests/validation/CPP/GEMMLowp.cpp
@@ -33,6 +33,36 @@ namespace validation
{
namespace reference
{
+namespace
+{
+template <typename T>
+void quantize_down_int32_to_uint8_scale(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
+ int32_t min, int32_t max)
+{
+ const int cols_in = in->shape().x();
+
+ for(int i = 0; i < in->num_elements(); ++i)
+ {
+ int32_t result = ((*in)[i] + result_offset) * result_mult_int;
+
+ if(bias != nullptr)
+ {
+ result += (*bias)[i % cols_in];
+ }
+
+ result >>= result_shift;
+
+ // Bounded ReLu
+ if(min != max)
+ {
+ result = std::max(min, std::min(max, result));
+ }
+
+ (*dst)[i] = static_cast<uint8_t>(std::max(0, std::min(255, result)));
+ }
+}
+} // namespace
+
template <typename T>
SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<T> &a, const SimpleTensor<T> &b, int32_t a_offset, int32_t b_offset)
{
@@ -80,21 +110,31 @@ SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor
}
template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max)
{
SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
- for(int i = 0; i < in.num_elements(); ++i)
- {
- const int32_t result = ((in[i] + result_offset) * result_mult_int) >> result_shift;
- dst[i] = static_cast<uint8_t>(std::max(0, std::min(255, result)));
- }
+ quantize_down_int32_to_uint8_scale<T>(&in, nullptr, &dst, result_offset, result_mult_int, result_shift, min, max);
+
+ return dst;
+}
+
+template <typename T>
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
+ int32_t min, int32_t max)
+{
+ SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
+
+ quantize_down_int32_to_uint8_scale<T>(&in, &bias, &dst, result_offset, result_mult_int, result_shift, min, max);
return dst;
}
template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, int32_t a_offset, int32_t b_offset);
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min,
+ int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, int32_t result_mult_int,
+ int32_t result_shift, int32_t min, int32_t max);
} // namespace reference
} // namespace validation
} // namespace test
diff --git a/tests/validation/CPP/GEMMLowp.h b/tests/validation/CPP/GEMMLowp.h
index cbed2206e3..ee33d8e0c0 100644
--- a/tests/validation/CPP/GEMMLowp.h
+++ b/tests/validation/CPP/GEMMLowp.h
@@ -35,14 +35,17 @@ namespace validation
{
namespace reference
{
+SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b);
+
template <typename T>
SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<T> &a, const SimpleTensor<T> &b, int32_t a_offset, int32_t b_offset);
template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift);
-
-SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b);
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min = 0, int32_t max = 0);
+template <typename T>
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
+ int32_t min = 0, int32_t max = 0);
} // namespace reference
} // namespace validation
} // namespace test
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index ba91ced443..078096a0dd 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -131,34 +131,55 @@ TEST_SUITE(OutputStage)
TEST_SUITE(QuantizeDownInt32ToUint8Scale)
-using NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToUint8Scale>;
+const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
+ 3)
+ * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
+
+const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1,
+ 2)
+ * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -4, 4) * framework::dataset::make("result_mult_int", 1, 3) * framework::dataset::make("result_shift", 2,
- 4);
+using NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToUint8Scale>;
DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), quantize_down_int32_to_uint8_scale_cases),
- shape, result_offset, result_mult_int, result_shift)
+ shape, result_offset, result_mult_int, result_shift, min, max, add_bias)
{
+ TensorShape shape_bias(shape[0]);
+
// Create tensors
- Tensor in = create_tensor<Tensor>(shape, DataType::S32);
- Tensor out = create_tensor<Tensor>(shape, DataType::QASYMM8);
+ Tensor in = create_tensor<Tensor>(shape, DataType::S32);
+ Tensor bias = create_tensor<Tensor>(shape_bias, DataType::S32);
+ Tensor out = create_tensor<Tensor>(shape, DataType::QASYMM8);
ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
// Create and configure function
NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
- output_stage.configure(&in, &out, result_offset, result_mult_int, result_shift);
+ output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_offset, result_mult_int, result_shift, min, max);
- // Validate valid region
+ // Validate valid region input and output
const ValidRegion valid_region = shape_to_valid_region(shape);
validate(in.info()->valid_region(), valid_region);
validate(out.info()->valid_region(), valid_region);
+ // Validate valid region bias
+ if(add_bias)
+ {
+ const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias);
+ validate(bias.info()->valid_region(), valid_region_bias);
+ }
+
// Validate padding
const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
validate(in.info()->padding(), padding);
validate(out.info()->padding(), padding);
+
+ if(add_bias)
+ {
+ validate(bias.info()->padding(), padding);
+ }
}
FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases))
@@ -173,8 +194,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture,
validate(Accessor(_target), _reference);
}
-TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale
+TEST_SUITE(BoundedReLu)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // BoundedReLu
+
+TEST_SUITE(AddBias)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // AddBias
+
+TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale
TEST_SUITE_END() // OutputStage
TEST_SUITE_END() // GEMMLowp
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index f9b0dbd959..a99e9323c8 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -122,10 +122,10 @@ class GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture : public framework:
{
public:
template <typename...>
- void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
+ void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
{
- _target = compute_target(shape, result_offset, result_mult_int, result_shift);
- _reference = compute_reference(shape, result_offset, result_mult_int, result_shift);
+ _target = compute_target(shape, result_offset, result_mult_int, result_shift, min, max, add_bias);
+ _reference = compute_reference(shape, result_offset, result_mult_int, result_shift, min, max, add_bias);
}
protected:
@@ -136,43 +136,72 @@ protected:
library->fill(tensor, distribution, i);
}
- TensorType compute_target(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
+ TensorType compute_target(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
{
+ TensorShape shape_bias(shape[0]);
+
// Create tensors
TensorType a = create_tensor<TensorType>(shape, DataType::S32, 1);
- TensorType b = create_tensor<TensorType>(shape, DataType::QASYMM8, 1);
+ TensorType b = create_tensor<TensorType>(shape_bias, DataType::S32, 1);
+ TensorType c = create_tensor<TensorType>(shape, DataType::QASYMM8, 1);
// Create and configure function
FunctionType output_stage;
- output_stage.configure(&a, &b, result_offset, result_mult_int, result_shift);
+ output_stage.configure(&a, add_bias ? &b : nullptr, &c, result_offset, result_mult_int, result_shift, min, max);
ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
// Allocate tensors
a.allocator()->allocate();
- b.allocator()->allocate();
+ c.allocator()->allocate();
ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
- // Fill tensors
+ // Fill tensor
fill(AccessorType(a), 0);
+ if(add_bias)
+ {
+ ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Allocate bias tensor
+ b.allocator()->allocate();
+
+ ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Fill tensor
+ fill(AccessorType(b), 1);
+ }
+
// Compute GEMM function
output_stage.run();
- return b;
+ return c;
}
- SimpleTensor<uint8_t> compute_reference(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
+ SimpleTensor<uint8_t> compute_reference(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
{
// Create reference
+ TensorShape shape_bias(shape[0]);
+
SimpleTensor<int32_t> a{ shape, DataType::S32, 1 };
+ SimpleTensor<int32_t> b{ shape_bias, DataType::S32, 1 };
// Fill reference
fill(a, 0);
- return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, result_offset, result_mult_int, result_shift);
+ if(add_bias)
+ {
+ // Fill bias
+ fill(b, 1);
+
+ return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, b, result_offset, result_mult_int, result_shift, min, max);
+ }
+ else
+ {
+ return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, result_offset, result_mult_int, result_shift, min, max);
+ }
}
TensorType _target{};