From f29d1b7d8bf2d1619554eb3443556b44d4aa1a4c Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Tue, 29 Oct 2019 10:58:13 +0000
Subject: COMPMID-2608: Enable quantization with multiplier greater than 1 on NEON

Change-Id: Ib2b0c9ac88fc2b645f478c9981f71ee28f2c77fd
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/2425
Comments-Addressed: Arm Jenkins
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
---
 arm_compute/core/NEON/NEAsymm.h                    | 160 +++++++++++++++------
 arm_compute/core/utils/quantization/AsymmHelpers.h |  10 +-
 .../runtime/NEON/functions/NEFullyConnectedLayer.h |  24 ++--
 .../NEON/functions/NEGEMMConvolutionLayer.h        |   8 +-
 .../NEDepthwiseConvolutionLayerNativeKernel.cpp    |  33 +++--
 .../NEDirectConvolutionLayerOutputStageKernel.cpp  |   7 +-
 ...GEMMLowpOffsetContributionOutputStageKernel.cpp |   8 +-
 ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp |   7 +-
 src/core/utils/quantization/AsymmHelpers.cpp       |  10 +-
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp |  12 +-
 .../NEON/functions/NEDirectConvolutionLayer.cpp    |   5 +-
 .../NEON/functions/NEFullyConnectedLayer.cpp       |  23 +--
 .../NEON/functions/NEGEMMConvolutionLayer.cpp      |   4 +-
 .../functions/NEGEMMLowpMatrixMultiplyCore.cpp     |  13 ++
 .../NEDepthwiseConvolutionAssemblyDispatch.cpp     |  10 ++
 tests/validation/NEON/ConvolutionLayer.cpp         |   3 +-
 tests/validation/NEON/DeconvolutionLayer.cpp       |  34 +++--
 .../validation/NEON/DepthwiseConvolutionLayer.cpp  |  94 ++++++------
 tests/validation/NEON/FullyConnectedLayer.cpp      |  10 +-
 tests/validation/fixtures/GEMMLowpFixture.h        |   2 +-
 20 files changed, 307 insertions(+), 170 deletions(-)

diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 67adcef9b1..c09a7d9028 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -88,17 +88,32 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
-    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-    in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
-    in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
-    in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
-    in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-
-    // Round to the nearest division by a power-of-two using result_shift_s32
-    in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
-    in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
-    in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
-    in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
+    if(result_shift < 0)
+    {
+        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
+        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
+        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
+        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
+
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+
+        // Round to the nearest division by a power-of-two using result_shift_s32
+        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
+        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
+        in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
+        in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
+    }
 
     // Add the offset terms
     in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
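For context, the requantization scheme the hunk above extends is easier to follow in scalar form. A real multiplier M is stored as a Q31 mantissa M0 in [0.5, 1) plus a right shift, M = M0 * 2^(-shift); a multiplier greater than one therefore yields a negative shift, which the new branch handles by pre-scaling the accumulator by 2^(-shift) before the doubling-high multiply. A minimal scalar sketch of that logic, with illustrative helper names rather than the library's exact code:

    #include <cstdint>
    #include <limits>

    // Scalar model of vqrdmulhq_n_s32: high 32 bits of 2*a*b, rounded, saturated.
    int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        const int64_t prod  = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int32_t nudge = prod >= 0 ? (1 << 30) : (1 - (1 << 30));
        const int64_t high  = (prod + nudge) >> 31;
        if(high > std::numeric_limits<int32_t>::max()) return std::numeric_limits<int32_t>::max();
        if(high < std::numeric_limits<int32_t>::min()) return std::numeric_limits<int32_t>::min();
        return static_cast<int32_t>(high);
    }

    // Scalar model of rounding_divide_by_pow2: round-to-nearest division by 2^exponent.
    int32_t rounding_divide_by_pow2_scalar(int32_t x, int exponent)
    {
        const int32_t mask      = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    int32_t requantize(int32_t acc, int32_t mantissa_q31, int shift)
    {
        if(shift < 0)
        {
            // Multiplier > 1: scale up by 2^(-shift) first, as in the new branch above.
            return saturating_rounding_doubling_high_mul(acc * (1 << -shift), mantissa_q31);
        }
        // Multiplier <= 1: Q31 multiply, then rounding division by 2^shift.
        return rounding_divide_by_pow2_scalar(saturating_rounding_doubling_high_mul(acc, mantissa_q31), shift);
    }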
@@ -154,17 +169,32 @@ int8x16_t finalize_quantization(int32x4x4_t &in_s32,
                                 int8x16_t min_s8,
                                 int8x16_t max_s8)
 {
-    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-    in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
-    in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
-    in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
-    in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-
-    // Round to the nearest division by a power-of-two using result_shift_s32
-    in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
-    in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
-    in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
-    in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
+    if(result_shift < 0)
+    {
+        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
+        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
+        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
+        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
+
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+
+        // Round to the nearest division by a power-of-two using result_shift_s32
+        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
+        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
+        in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
+        in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
+    }
 
     // Add the offset terms
     in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
@@ -214,17 +244,54 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
                                             const int8x16_t &min_s8,
                                             const int8x16_t &max_s8)
 {
-    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-    in_s32.val[0] = vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]);
-    in_s32.val[1] = vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]);
-    in_s32.val[2] = vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]);
-    in_s32.val[3] = vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]);
+    const static int32x4_t one_s32 = vdupq_n_s32(1);
+
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+    int32x4x4_t res_shift_gt0 =
+    {
+        vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
+        vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
+        vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
+        vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]),
+    };
 
     // Round to the nearest division by a power-of-two using result_shift_s32
-    in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift.val[0]);
-    in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift.val[1]);
-    in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift.val[2]);
-    in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift.val[3]);
+    res_shift_gt0.val[0] = rounding_divide_by_pow2(res_shift_gt0.val[0], result_shift.val[0]);
+    res_shift_gt0.val[1] = rounding_divide_by_pow2(res_shift_gt0.val[1], result_shift.val[1]);
+    res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
+    res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);
+
+    int32x4x4_t res_shift_lt0 =
+    {
+        vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
+        vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
+        vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
+        vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))),
+    };
+    res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]);
+    res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]);
+    res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]);
+    res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
+
+    // Select result depending on shift value
+    const uint32x4x4_t mask_lt0 =
+    {
+#ifdef __aarch64__
+        vcltzq_s32(result_shift.val[0]),
+        vcltzq_s32(result_shift.val[1]),
+        vcltzq_s32(result_shift.val[2]),
+        vcltzq_s32(result_shift.val[3]),
+#else  //__aarch64__
+        vcltq_s32(result_shift.val[0], vdupq_n_s32(0)),
+        vcltq_s32(result_shift.val[1], vdupq_n_s32(0)),
+        vcltq_s32(result_shift.val[2], vdupq_n_s32(0)),
+        vcltq_s32(result_shift.val[3], vdupq_n_s32(0)),
+#endif //__aarch64__
+    };
+
+    in_s32.val[0] = vbslq_s32(mask_lt0.val[0], res_shift_lt0.val[0], res_shift_gt0.val[0]);
+    in_s32.val[1] = vbslq_s32(mask_lt0.val[1], res_shift_lt0.val[1], res_shift_gt0.val[1]);
+    in_s32.val[2] = vbslq_s32(mask_lt0.val[2], res_shift_lt0.val[2], res_shift_gt0.val[2]);
+    in_s32.val[3] = vbslq_s32(mask_lt0.val[3], res_shift_lt0.val[3], res_shift_gt0.val[3]);
 
     // Add the offset terms
     in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
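In this per-channel variant the shift differs per lane, so both candidate results are computed for all lanes and blended with a bitwise select rather than a branch. A sketch of that selection idiom, assuming only the standard <arm_neon.h> intrinsics (vcltzq_s32 needs AArch64, hence the guarded fallback above):

    #include <arm_neon.h>

    // Per-lane choice between the negative-shift and non-negative-shift results.
    int32x4_t select_by_shift_sign(int32x4_t res_lt0, int32x4_t res_ge0, int32x4_t shift)
    {
        const uint32x4_t shift_is_neg = vcltq_s32(shift, vdupq_n_s32(0)); // all-ones where shift < 0
        return vbslq_s32(shift_is_neg, res_lt0, res_ge0);                 // bitwise select per lane
    }

Both paths are always evaluated here; that is the usual price of branch-free SIMD code, and it is why the uniform-shift overloads above keep a plain scalar if/else instead.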
@@ -273,11 +340,17 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
 {
     int32x4_t in_s32 = vdupq_n_s32(in_value);
 
-    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-    in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
-
-    // Shift value by result_shift_s32
-    in_value = rounding_divide_by_pow2(in_value, result_shift);
+    if(result_shift < 0)
+    {
+        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
+
+        // Shift value by result_shift_s32
+        in_value = rounding_divide_by_pow2(in_value, result_shift);
+    }
 
     // Add the offset term
     in_value += result_offset_after_shift_s32;
@@ -312,11 +385,18 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult
 {
     int32x4_t in_s32 = vdupq_n_s32(in_value);
 
-    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-    in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
+    if(result_shift < 0)
+    {
+        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+    }
+    else
+    {
+        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
 
-    // Shift value by result_shift_s32
-    in_value = rounding_divide_by_pow2(in_value, result_shift);
+        // Shift value by result_shift_s32
+        in_value = rounding_divide_by_pow2(in_value, result_shift);
+    }
 
     // Add the offset term
     in_value += result_offset_after_shift_s32;
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 1bdc9959c8..94876fb02f 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -60,7 +60,7 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *q
  */
 Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift);
 
-/** Calculate quantized representation of per-channel multipliers with value less than one.
+/** Calculate quantized representation of per-channel multipliers
  *
  * @param[in]      iq_info    Input quantization info.
  * @param[in]      wq_info    Weights quantization info.
 *
 * @return a status
 */
-Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info,
-                                                     const QuantizationInfo &wq_info,
-                                                     const QuantizationInfo &oq_info,
-                                                     GEMMLowpOutputStageInfo &stage_info);
+Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
+                                       const QuantizationInfo &wq_info,
+                                       const QuantizationInfo &oq_info,
+                                       GEMMLowpOutputStageInfo &stage_info);
 
 /** Get minimum and maximum values for the input quantized data type
  *
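The renamed helper drops the "less than one" restriction because the (mantissa, shift) decomposition itself works for any positive multiplier. Roughly, and with illustrative names rather than the library's exact implementation, the decomposition looks like this:

    #include <cmath>
    #include <cstdint>

    // multiplier ~= quant_multiplier * 2^(-shift), quant_multiplier in Q31.
    // shift >= 0 is a right shift (multiplier <= 1); shift < 0 (multiplier > 1)
    // triggers the pre-scaling branch added in NEAsymm.h above.
    // Assumes multiplier > 0.
    void decompose_multiplier(double multiplier, int32_t *quant_multiplier, int32_t *shift)
    {
        int exponent = 0;
        const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
        int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
        if(q_fixed == (1ll << 31)) // rounding pushed q up to 1.0: renormalise
        {
            q_fixed /= 2;
            ++exponent;
        }
        *quant_multiplier = static_cast<int32_t>(q_fixed);
        *shift            = -exponent; // negative when multiplier > 1
    }

For example, 1.1 = 0.55 * 2^1, so the mantissa is about 0.55 * 2^31 and the shift is -1.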
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 8150737ebe..784637a796 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -131,7 +131,7 @@ public:
      *                     If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
      *                     If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
      *                     Data type supported: Same as @p input.
-     * @param[in]  biases  Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+     * @param[in]  biases  Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8.
      * @param[out] output  Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
      *                     - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
      *                     - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
@@ -142,17 +142,17 @@ public:
                    FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayer
      *
-     * @param[in]  input   Source tensor info. Data type supported: QASYMM8/F16/F32.
-     * @param[in]  weights Weights tensor info. The weights must be 2 dimensional.
-     *                     If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
-     *                     If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
-     *                     Data type supported: Same as @p input.
-     * @param[in]  biases  Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
-     * @param[out] output  Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
-     *                     - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
-     *                     - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
-     *                     Data type supported: Same as @p input.
-     * @param[in]  fc_info (Optional) Fully connected layer additional info
+     * @param[in] input   Source tensor info. Data type supported: QASYMM8/F16/F32.
+     * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+     *                    If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
+     *                    If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+     *                    Data type supported: Same as @p input.
+     * @param[in] biases  Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8.
+     * @param[in] output  Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
+     *                    - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+     *                    - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+     *                    Data type supported: Same as @p input.
+     * @param[in] fc_info (Optional) Fully connected layer additional info
      *
      * @return a status
      */
Data type supported: Same as @p weights. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. * @param[out] output Destination tensor. Data types supported: Same as @p weights. */ void configure(const ITensor *weights, const ITensor *biases, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights * * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[in] output Destination tensor info. Data types supported: Same as @p weights. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. + * @param[in] output Destination tensor. Data types supported: Same as @p weights. * * @return an error status */ diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp index a9a3183c5d..aee13ee578 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp @@ -289,7 +289,16 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w acc.at(i) += *reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)); } - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), output_multiplier.at(id.x() + i)), output_shift.at(id.x() + i)) + output_qoffset; + const int out_mul = output_multiplier.at(id.x() + i); + const int out_shift = output_shift.at(id.x() + i); + if(out_shift < 0) + { + acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; + } out_vals[i] = static_cast(utility::clamp(acc.at(i))); } @@ -381,21 +390,20 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh if(has_biases) { - const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); + } - int32_t out_val = acc.at(m) + biases_val; - out_val = rounding_divide_by_exp2(saturating_doubling_high_mul(out_val, output_multiplier.at(id.x() + m)), - output_shift.at(id.x() + m)) - + output_qoffset; - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(out_val)); + const int out_mul = output_multiplier.at(id.x() + m); + const int out_shift = output_shift.at(id.x() + m); + if(out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; } else { - int32_t out_val = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), output_multiplier.at(id.x() + m)), - output_shift.at(id.x() + m)) - + output_qoffset; - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(out_val)); + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + 
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 4313a5e312..8834d9747a 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -38,14 +38,15 @@
 #include <cstddef>
 #include <cstdint>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
                           int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+    ARM_COMPUTE_UNUSED(result_shift);
     ARM_COMPUTE_UNUSED(result_offset_after_shift);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
@@ -53,7 +54,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
                                                          DataType::F16,
                                                          DataType::S32, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(result_shift < 0, "Result shift must be a non negative integer");
 
     if(bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::S32, DataType::F32);
@@ -596,3 +596,4 @@ void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const
 
     (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 84187332f8..86abb2d65c 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -909,8 +909,12 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row,
                            && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
 
     // Check if we need to clamp the result using min and max
-    const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound)
-                                  && !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255));
+    PixelValue type_min = 0;
+    PixelValue type_max = 0;
+    std::tie(type_min, type_max) = get_min_max(output->info()->data_type());
+    int type_min_int = type_min.get<int>();
+    int type_max_int = type_max.get<int>();
+    const bool is_bounded_relu = !(output_stage.gemmlowp_min_bound == type_min_int && output_stage.gemmlowp_max_bound == type_max_int);
 
     // Check if we need to perform fixed point requantization
     const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
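The reworked is_bounded_relu test above compares the requested clamp bounds against the actual range of the output data type rather than the hard-coded unsigned range [0, 255]. Its intent, in isolation:

    // Clamping is a no-op exactly when the bounds already span the output type's full range.
    bool needs_clamp(int min_bound, int max_bound, int type_min, int type_max)
    {
        return !(min_bound == type_min && max_bound == type_max);
    }

For uint8_t outputs this reproduces the old behaviour (type_min 0, type_max 255); for int8_t outputs it correctly uses -128 and 127.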
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 4906e6a987..bb0b86404e 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -39,8 +39,8 @@
 #include <cstddef>
 #include <cstdint>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
@@ -244,4 +244,5 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
     (this->*_func)(window);
-}
\ No newline at end of file
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 11241e83a0..5bda746e09 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -106,10 +106,10 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
     return Status{};
 }
 
-arm_compute::Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info,
-                                                                  const QuantizationInfo &wq_info,
-                                                                  const QuantizationInfo &oq_info,
-                                                                  GEMMLowpOutputStageInfo &stage_info)
+arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
+                                                    const QuantizationInfo &wq_info,
+                                                    const QuantizationInfo &oq_info,
+                                                    GEMMLowpOutputStageInfo &stage_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
     ARM_COMPUTE_RETURN_ERROR_ON(wq_info.scale().empty());
@@ -131,7 +131,7 @@ arm_compute::Status calculate_quantized_multipliers_less_than_one(const Quantiza
         const float multiplier       = i_scale * w_scales[i] / o_scale;
         int32_t     quant_multiplier = 0;
         int32_t     quant_shift      = 0;
-        ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &quant_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier(multiplier, &quant_multiplier, &quant_shift));
         quant_multipliers[i] = quant_multiplier;
         quant_shifts[i]      = quant_shift;
     }
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index ca4fe732a7..ddcc71f466 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -60,16 +60,6 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo
     const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type());
 
-    if(is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
-        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
-        ARM_COMPUTE_UNUSED(multiplier);
-        ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
-    }
     if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
     {
         TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -205,7 +195,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
             float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
             int32_t output_multiplier;
             int32_t output_shift;
-            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
             _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
             _accumulator.allocator()->allocate();
         }
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 322bb2c425..65538848df 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -31,8 +31,8 @@
 #include <cmath>
 #include <tuple>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
       _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
@@ -118,3 +118,4 @@ void NEDirectConvolutionLayer::run()
         _activationlayer_function.run();
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index b3b90f8599..01746eb3db 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -33,7 +33,8 @@
 #include <algorithm>
 #include <cmath>
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
 
 namespace
@@ -258,7 +259,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
         float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
         int32_t output_multiplier;
         int32_t output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, oq_info.offset);
         _gemmlowp_output.allocator()->allocate();
     }
@@ -352,13 +353,14 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
     // Validate output stage for asymmetric quantized types
     if(is_quantized)
     {
-        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-        const float multiplier                = iq_info.scale * wq_info.scale / oq_info.scale;
-
-        ARM_COMPUTE_UNUSED(multiplier);
-        ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+
+        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
+        int   output_multiplier;
+        int   output_shift;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
     }
 
@@ -475,4 +477,5 @@ void NEFullyConnectedLayer::prepare()
 
         _is_prepared = true;
     }
-}
\ No newline at end of file
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 0507c6b2bd..920917a58b 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -154,7 +154,7 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
         output_info.gemmlowp_min_bound       = min_activation;
         output_info.gemmlowp_max_bound       = max_activation;
         output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
-        quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info);
+        quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
 
         _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
 
@@ -217,7 +217,7 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
         output_info.gemmlowp_min_bound       = min_activation;
         output_info.gemmlowp_max_bound       = max_activation;
         output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
 
         // Perform validation step on GEMMLowp
         std::unique_ptr<ITensorInfo> input_qa = input->clone();
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index e36cb3d399..440f043527 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -344,6 +344,19 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         {
             run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
             run_optimised_requantized = run_optimised;
+
+            const UniformQuantizationInfo a_qinfo      = a_to_use->quantization_info().uniform();
+            const QuantizationInfo        b_qinfo      = b->quantization_info();
+            const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
+            for(auto const s : b_qinfo.scale())
+            {
+                const float fmultipler = a_qinfo.scale * s / output_qinfo.scale;
+                if(fmultipler > 1.f)
+                {
+                    run_optimised_requantized = false;
+                    break;
+                }
+            }
         }
         else
         {
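The block added to NEGEMMLowpMatrixMultiplyCore::validate() above keeps the assembly-dispatch path away from cases it cannot requantize: if any per-channel effective multiplier exceeds 1, run_optimised_requantized is cleared and requantization falls back to the separate output stage. Extracted as a standalone predicate (an illustrative sketch, not library API), the test amounts to:

    #include <vector>

    // True if any channel's effective multiplier input_scale * w / output_scale exceeds 1.
    bool has_multiplier_greater_than_one(float input_scale, const std::vector<float> &weights_scales, float output_scale)
    {
        for(const float w : weights_scales)
        {
            if(input_scale * w / output_scale > 1.f)
            {
                return true;
            }
        }
        return false;
    }

The depthwise assembly dispatch below applies the same test, but as a hard validation error rather than a fallback.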
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index 3235eee19a..142f873ef4 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -437,6 +437,16 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
+    // The uniform quantization case will only have 1 scale value in the weights quantization info
+    const UniformQuantizationInfo input_qinfo   = input->quantization_info().uniform();
+    const QuantizationInfo        weights_qinfo = weights->quantization_info();
+    const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();
+    for(auto const s : weights_qinfo.scale())
+    {
+        const float fmultipler = input_qinfo.scale * s / output_qinfo.scale;
+        ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f);
+    }
+
     return Status{};
 }
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 1d7805d024..fbc5a830a9 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -80,6 +80,7 @@ const auto QuantizationData = framework::dataset::make("QuantizationInfo",
     QuantizationInfo(0.5f, 10),
     QuantizationInfo(0.3f, 3),
     QuantizationInfo(1.f, 10),
+    QuantizationInfo(1.1f, 10),
 });
 } // namespace
 
@@ -482,7 +483,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedPerChannelFixtur
                        framework::dataset::make("DataType", { DataType::QASYMM8 })),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                        QuantizationData),
-                       ActivationFunctionsDataset),
+                       QuantizedActivationFunctionsDataset),
                        framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
 {
     // Validate output
diff --git a/tests/validation/NEON/DeconvolutionLayer.cpp b/tests/validation/NEON/DeconvolutionLayer.cpp
index b6911c6dbc..89f9d98ed5 100644
--- a/tests/validation/NEON/DeconvolutionLayer.cpp
+++ b/tests/validation/NEON/DeconvolutionLayer.cpp
@@ -68,6 +68,18 @@ const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::
 const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
 
 const auto add_bias_dataset = framework::dataset::make("AddBias", { true, false });
+
+const auto input_qinfo_dataset = framework::dataset::make("InputQInfo",
+{
+    QuantizationInfo(1.f / 255.f, 0),
+    QuantizationInfo(2.f, 0),
+});
+
+const auto output_qinfo_dataset = framework::dataset::make("OutputQInfo",
+{
+    QuantizationInfo(3.f / 255.f, 0),
+    QuantizationInfo(4.f, 0),
+});
 } // namespace
 
 TEST_SUITE(NEON)
@@ -161,16 +173,16 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
 // *INDENT-ON*
 
 template <typename T>
 using NEDeconvolutionLayerFixture4x4 = DeconvolutionValidationFixture<NEDeconvolutionLayer, T, 4, 4>;
 
 template <typename T>
 using NEDeconvolutionLayerFixture3x3 = DeconvolutionValidationFixture<NEDeconvolutionLayer, T, 3, 3>;
 
 template <typename T>
 using NEDeconvolutionLayerAsymmFixture3x3 = DeconvolutionValidationAsymmFixture<NEDeconvolutionLayer, T, 3, 3>;
 
 template <typename T>
 using NEDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture<NEDeconvolutionLayer, T, 1, 1>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -277,8 +289,8 @@ TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 
 TEST_SUITE(W4x4)
 FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture4x4<uint8_t>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(combine(combine(data4x4, framework::dataset::make("DataType", DataType::QASYMM8)),
                                                data_layouts_dataset),
-                                      framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })),
-                                      framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })),
+                                      input_qinfo_dataset),
+                                      output_qinfo_dataset),
                                add_bias_dataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -291,8 +303,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        combine(combine(combine(combine(combine(data1x1, framework::dataset::make("DataType", DataType::QASYMM8)),
                                                data_layouts_dataset),
-                                      framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })),
-                                      framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })),
+                                      input_qinfo_dataset),
+                                      output_qinfo_dataset),
                                add_bias_dataset))
 {
     // Validate output
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 6d8c083c3f..fbfb16ab04 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -62,6 +62,12 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
 });
+
+const auto input_qinfo_dataset = framework::dataset::make("InputQInfo",
+{
+    QuantizationInfo(0.3f, 10),
+    QuantizationInfo(2.2f, 10),
+});
 } // namespace
 
 TEST_SUITE(NEON)
@@ -274,12 +280,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, fram
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                       depth_multipliers),
-                       framework::dataset::make("DataType",
-                                                DataType::F32)),
-                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                       depth_multipliers),
+                       framework::dataset::make("DataType",
+                                                DataType::F32)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                       ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -296,12 +302,12 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                       depth_multipliers),
-                       framework::dataset::make("DataType",
-                                                DataType::F32)),
-                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                       depth_multipliers),
+                       framework::dataset::make("DataType",
+                                                DataType::F32)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                       ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -316,7 +322,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, fram
     validate(Accessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                depth_multipliers),
                                        framework::dataset::make("DataType",
                                                                 DataType::F32)),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -397,12 +403,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, frame
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                       depth_multipliers),
-                       framework::dataset::make("DataType",
-                                                DataType::F16)),
-                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                       depth_multipliers),
+                       framework::dataset::make("DataType",
+                                                DataType::F16)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                       ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
@@ -421,12 +427,12 @@ TEST_SUITE_END() // Generic
 
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
 
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                       depth_multipliers),
-                       framework::dataset::make("DataType",
-                                                DataType::F16)),
-                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                       depth_multipliers),
+                       framework::dataset::make("DataType",
+                                                DataType::F16)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                       ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
@@ -443,7 +449,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, frame
 
 TEST_SUITE(Dilation)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                depth_multipliers),
                                        framework::dataset::make("DataType",
                                                                 DataType::F16)),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -517,7 +523,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                depth_multipliers),
                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                               framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                               input_qinfo_dataset),
                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -565,7 +571,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOpti
                        combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                large_depth_multipliers),
                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                               framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               input_qinfo_dataset),
                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
 TEST_SUITE(Dilation)
@@ -578,7 +584,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOpti
                        combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                depth_multipliers),
                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                               framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               input_qinfo_dataset),
                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -589,7 +595,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOpti
                        combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
                                large_depth_multipliers),
                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                               framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               input_qinfo_dataset),
                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -605,7 +611,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixtureO
                        framework::dataset::make("DepthMultiplier", 1)),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                       input_qinfo_dataset),
                        framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                        ActivationFunctionsDataset))
@@ -617,7 +623,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixtureO
                        framework::dataset::make("DepthMultiplier", 1)),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                       input_qinfo_dataset),
                        framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                        ActivationFunctionsDataset))
@@ -629,7 +635,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixtureO
                        framework::dataset::make("DepthMultiplier", 1)),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                       input_qinfo_dataset),
                        framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                        ActivationFunctionsDataset))
@@ -640,12 +646,12 @@ TEST_SUITE_END() // Optimized
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QSYMM8_PER_CHANNEL)
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture<uint8_t>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                                depth_multipliers),
                                framework::dataset::make("InputDataType", DataType::QASYMM8)),
                                framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                               framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                               input_qinfo_dataset),
                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
 TEST_SUITE(Dilation)
@@ -654,12 +660,12 @@
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture<uint8_t>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                depth_multipliers),
                                framework::dataset::make("InputDataType", DataType::QASYMM8)),
                                framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                               framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                               input_qinfo_dataset),
                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -671,7 +677,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPe
                                depth_multipliers),
                                framework::dataset::make("InputDataType", DataType::QASYMM8)),
                                framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                               framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                               input_qinfo_dataset),
                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -682,12 +688,12 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture<uint8_t>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
                                framework::dataset::make("DepthMultiplier", 1)),
                                framework::dataset::make("InputDataType", DataType::QASYMM8)),
                                framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                               framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                               input_qinfo_dataset),
                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
                                framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -699,7 +705,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedSymmetri
                        framework::dataset::make("DepthMultiplier", 1)),
                        framework::dataset::make("InputDataType", DataType::QASYMM8)),
                        framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                       input_qinfo_dataset),
                        framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
                        framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                        ActivationFunctionsDataset))
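The widened quantization datasets above are what actually exercise the new path end to end: with an input scale of 2.2 (from input_qinfo_dataset), a weight scale of 0.3 and an output scale of 0.5 (a DstQuantizationInfo value used above), the effective multiplier is 2.2 * 0.3 / 0.5 = 1.32, i.e. a Q31 mantissa of about 0.66 with shift = -1, whereas every combination in the old datasets stayed at or below 1.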
diff --git a/tests/validation/NEON/FullyConnectedLayer.cpp b/tests/validation/NEON/FullyConnectedLayer.cpp
index 7b32ae4fb9..a7b837fedf 100644
--- a/tests/validation/NEON/FullyConnectedLayer.cpp
+++ b/tests/validation/NEON/FullyConnectedLayer.cpp
@@ -64,6 +64,12 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 });
 
 const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true }));
+
+const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+{
+    QuantizationInfo(1.f / 256.f, 10),
+    QuantizationInfo(1.1f, 10),
+});
 } // namespace
 
 TEST_SUITE(NEON)
@@ -214,7 +220,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<uint8_t>,
                        combine(datasets::SmallFullyConnectedLayerDataset(),
                                FullyConnectedParameters),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 10) })))
+                       QuantizationData))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -223,7 +229,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture<uint8_t>,
                        combine(datasets::LargeFullyConnectedLayerDataset(),
                                FullyConnectedParameters),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 256.f, 10) })))
+                       QuantizationData))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index db52be5062..b93a6447d7 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -217,7 +217,7 @@ public:
         output_stage.gemmlowp_shifts.resize(num_channels);
         for(size_t i = 0; i < num_channels; ++i)
         {
-            quantization::calculate_quantized_multiplier_less_than_one(scales[i], &output_stage.gemmlowp_multipliers[i], &output_stage.gemmlowp_shifts[i]);
+            quantization::calculate_quantized_multiplier(scales[i], &output_stage.gemmlowp_multipliers[i], &output_stage.gemmlowp_shifts[i]);
         }
 
         _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_b, QuantizationInfo(scales));
-- 
cgit v1.2.1