From 14cbfb2921990d8bf125231e350e2ac8dcd95a8b Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 23 Oct 2019 10:53:10 +0100 Subject: COMPMID-2609: Enable quantization with multiplier greater than 1 on OpenCL Change-Id: I050f1f84e214e61f7cbb0197a672b68a4940edae Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2158 Comments-Addressed: Arm Jenkins Reviewed-by: Manuel Bottini Tested-by: Arm Jenkins Reviewed-by: Giorgio Arena --- .../runtime/CL/functions/CLGEMMConvolutionLayer.h | 11 +- .../CL/functions/CLGEMMDeconvolutionLayer.h | 7 +- .../cl_kernels/depthwise_convolution_quantized.cl | 146 +++++++++++++++------ .../CL/cl_kernels/direct_convolution_quantized.cl | 18 +-- src/core/CL/cl_kernels/gemmlowp.cl | 22 +++- src/core/CL/cl_kernels/helpers_asymm.h | 2 + .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 8 ++ .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 8 ++ .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 20 +++ .../CL/kernels/CLDirectConvolutionLayerKernel.cpp | 74 +++++++---- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 6 +- src/core/utils/quantization/AsymmHelpers.cpp | 3 +- src/runtime/CL/functions/CLConvolutionLayer.cpp | 4 +- src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 2 +- .../CL/functions/CLGEMMDeconvolutionLayer.cpp | 17 ++- tests/validate_examples/cl_gemm.cpp | 8 +- tests/validation/CL/ConvolutionLayer.cpp | 2 +- tests/validation/CL/DeconvolutionLayer.cpp | 24 ++-- tests/validation/CL/DepthwiseConvolutionLayer.cpp | 16 +-- tests/validation/CL/DirectConvolutionLayer.cpp | 10 +- tests/validation/CL/FullyConnectedLayer.cpp | 20 +-- tests/validation/reference/Convolution3d.h | 13 +- .../reference/DepthwiseConvolutionLayer.cpp | 7 +- tests/validation/reference/FullyConnectedLayer.cpp | 7 +- tests/validation/reference/UtilsQuantizedAsymm.h | 26 +++- 25 files changed, 319 insertions(+), 162 deletions(-) diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h index 3392f11b06..ce034cd9ba 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h @@ -21,14 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__ -#define __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__ +#ifndef ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H +#define ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" #include "arm_compute/core/Types.h" @@ -36,8 +34,6 @@ #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" -#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/ITransformWeights.h" #include "arm_compute/runtime/IWeightsManager.h" @@ -143,7 +139,6 @@ private: * -# @ref CLGEMM (if the data type is FP32 or FP16) * -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8) * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8) - * -# @ref CLElementwiseOperationKernel for addition (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout) * -# @ref CLCol2ImKernel (if NCHW data layout) */ class CLGEMMConvolutionLayer : public IFunction @@ -267,4 +262,4 @@ private: bool _is_prepared; }; } // namespace arm_compute -#endif /* __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__ */ +#endif /* ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h index 0a71995158..3df9205f48 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h @@ -21,12 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef __ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H__ -#define __ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H__ +#ifndef ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H +#define ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H #include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/CL/functions/CLPermute.h" #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/runtime/CL/functions/CLSlice.h" @@ -141,4 +142,4 @@ private: bool _is_quantized; }; } // namespace arm_compute -#endif /* __ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H__ */ +#endif /* ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H */ diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index 08358755b1..ac1406b6d8 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -47,7 +47,7 @@ #define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size) -#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER)) +#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) #if defined(WEIGHTS_PROMOTED_TYPE) #define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size) @@ -224,9 +224,6 @@ __kernel void dwc_3x3_native_quantized8_nchw( #if defined(PER_CHANNEL_QUANTIZATION) const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel)); const int output_shift = *((__global int *)vector_offset(&output_shifts, channel)); -#else // defined(PER_CHANNEL_QUANTIZATION) - const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0)); - const int output_shift = *((__global int *)vector_offset(&output_shifts, 0)); #endif // defined(PER_CHANNEL_QUANTIZATION) int8 values0 = 0; @@ -335,7 +332,17 @@ __kernel void dwc_3x3_native_quantized8_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); +#if defined(PER_CHANNEL_QUANTIZATION) + int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8); + int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); + values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_OFFSET < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -351,7 +358,17 @@ __kernel void dwc_3x3_native_quantized8_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); +#if defined(PER_CHANNEL_QUANTIZATION) + int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8); + int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); + values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_OFFSET < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -667,7 +684,17 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); +#if defined(PER_CHANNEL_QUANTIZATION) + int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8); + int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); + values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_OFFSET < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -684,7 +711,17 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); +#if defined(PER_CHANNEL_QUANTIZATION) + int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8); + int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); + values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_OFFSET < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -943,17 +980,23 @@ __kernel void dwc_3x3_reshaped_quantized8_nhwc( acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); #else // defined(REAL_MULTIPLIER) + #if defined(PER_CHANNEL_QUANTIZATION) Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers); Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts); VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr); VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr); -#else // defined(PER_CHANNEL_QUANTIZATION) - const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); - const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); -#endif // defined(PER_CHANNEL_QUANTIZATION) - acc = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift); + VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, output_multiplier, output_shift, VEC_SIZE); + VEC_INT res_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift); + acc = select(res_shift_lt0, res_shift_gt0, output_shift >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE); +#else // OUTPUT_SHIFT < 0 + acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#endif // OUTPUT_SHIFT < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -1255,15 +1298,32 @@ __kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc( Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts); VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr); VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr); -#else // defined(PER_CHANNEL_QUANTIZATION) - const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); - const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); -#endif // defined(PER_CHANNEL_QUANTIZATION) - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); - acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift); - acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift); + res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE); + res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE); + res2_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE); + res3_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE); + res0_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); + res1_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); + res2_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift); + res3_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift); + acc0 = select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0); + acc1 = select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0); + acc2 = select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0); + acc3 = select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE); + acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE); + acc2 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE); + acc3 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE); +#else // OUTPUT_SHIFT < 0 + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); + acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift); + acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift); +#endif // OUTPUT_SHIFT < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -1375,7 +1435,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -1383,7 +1443,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( #if defined(DST_DEPTH) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; #endif /* defined(DST_DEPTH) */ int z_coord = 0; @@ -1519,11 +1579,14 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); #else // defined(REAL_MULTIPLIER) - const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); - const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); +#if OUTPUT_SHIFT < 0 + acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE); + acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE); +#else // OUTPUT_SHIFT < 0 + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#endif // OUTPUT_SHIFT < 0 #endif // defined(REAL_MULTIPLIER) acc0 += (VEC_INT)OUTPUT_OFFSET; @@ -1553,9 +1616,9 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( #endif // defined(WEIGHTS_PROMOTED_TYPE) -#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER)) +#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) -#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) +#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped * * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2) @@ -1630,7 +1693,7 @@ __kernel void dwc_MxN_native_quantized8_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0; @@ -1646,12 +1709,6 @@ __kernel void dwc_MxN_native_quantized8_nhwc( #if defined(PER_CHANNEL_QUANTIZATION) __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; - - VEC_INT output_multiplier = (VEC_INT)0; - VEC_INT output_shift = (VEC_INT)0; -#else // defined(PER_CHANNEL_QUANTIZATION) - const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); - const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); #endif // defined(PER_CHANNEL_QUANTIZATION) #if defined(DST_DEPTH) @@ -1700,11 +1757,20 @@ __kernel void dwc_MxN_native_quantized8_nhwc( #endif // defined(HAS_BIAS) #if defined(PER_CHANNEL_QUANTIZATION) - output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr)); - output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr)); + VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr)); + VEC_INT output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr)); + + VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0); + VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0); + res = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + res = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0); +#else // OUTPUT_SHIFT < 0 + res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0); +#endif // OUTPUT_OFFSET < 0 #endif // defined(PER_CHANNEL_QUANTIZATION) - res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0); res += (VEC_INT)OUTPUT_OFFSET; VEC_TYPE(VEC_SIZE) @@ -1726,5 +1792,5 @@ __kernel void dwc_MxN_native_quantized8_nhwc( } #endif // DEPTH_MULTIPLIER > 1 } -#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) +#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) #endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE) diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl index 1182428cd5..37fd9a0778 100644 --- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/direct_convolution_quantized.cl @@ -25,7 +25,7 @@ #undef CONVERT_SAT -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) #if KERNEL_SIZE == 9 @@ -194,6 +194,8 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel) * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH * @note If biases are used then -DHAS_BIAS has to be passed at compile time + * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234 + * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -227,8 +229,6 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel) * @param[in] input_offset Input offset quantization parameter * @param[in] weight_offset Weights offset quantization parameter * @param[in] output_offset Output offset quantization parameter - * @param[in] output_multiplier Output integer multiplier quantization parameter - * @param[in] output_shift Output integer shift quantization parameter */ __kernel void direct_convolution_quantized( TENSOR3D_DECLARATION(src), @@ -240,9 +240,7 @@ __kernel void direct_convolution_quantized( unsigned int weights_stride_w, int input_offset, int weight_offset, - int output_offset, - int output_multiplier, - int output_shift) + int output_offset) { Image src = CONVERT_TO_IMAGE_STRUCT(src); Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); @@ -294,9 +292,13 @@ __kernel void direct_convolution_quantized( pixels0 += (int8)(*bias_addr); #endif /* defined(HAS_BIAS) */ - pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, output_multiplier, output_shift, 8); +#if OUTPUT_SHIFT < 0 + pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_SHIFT < 0 pixels0 = pixels0 + output_offset; vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr); } -#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) +#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index 7a97fa6fa1..fa08b149e4 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -1673,9 +1673,17 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); - in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4); -#else // !defined(PER_CHANNEL_QUANTIZATION) + int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4); + int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4); + in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) + +#if RESULT_SHIFT < 0 + in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + #endif // defined(PER_CHANNEL_QUANTIZATION) // Add the offset terms to GEMM's result @@ -1768,7 +1776,11 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), // Multiply by result_mult_int and shift input_values *= RESULT_MULT_INT; +#if RESULT_SHIFT < 0 + input_values >>= -RESULT_SHIFT; +#else // RESULT_SHIFT >= 0 input_values >>= RESULT_SHIFT; +#endif // RESULT_SHIFT < 0 uchar4 res = convert_uchar4_sat(input_values); @@ -1850,7 +1862,11 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO #endif // defined(ADD_BIAS) // Multiply by result_mult_int and shift +#if RESULT_SHIFT < 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 // Add the offset terms to GEMM's result input_values += (int4)RESULT_OFFSET_AFTER_SHIFT; @@ -1937,7 +1953,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE // Multiply by result_mult_int and shift #if RESULT_SHIFT < 0 - input_values = ASYMM_MULT(input_values * (1 << (-RESULT_SHIFT)), RESULT_FIXEDPOINT_MULTIPLIER, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); #else // RESULT_SHIFT >= 0 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); #endif // RESULT_SHIFT < 0 diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index f7eff758c0..09409dc5e9 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -369,6 +369,8 @@ inline float dequantize_qasymm8(uchar input, float offset, float scale) #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ + ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index a2f4a913ce..d9705de5a4 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -297,6 +297,14 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8"); + // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler + float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + if(act_info.enabled()) { const int a_val = quantize_qasymm8(act_info.a(), oq_info); diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index d5f37f32ce..8db85edc62 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -260,6 +260,14 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8"); + // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler + float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + if(act_info.enabled()) { const int a_val = quantize_qasymm8(act_info.a(), oq_info); diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index 3fc236eaa7..2155306d62 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -107,6 +107,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } + if(is_data_type_quantized(input->data_type())) + { + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; + + float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + } + return Status{}; } @@ -236,6 +248,14 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset)); build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION"); + // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler + float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + if(dwc_info.activation_info.enabled()) { const int a_val = quantize_qasymm8(dwc_info.activation_info.a(), oq_info); diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp index 7b74a5a98c..e61e5c3901 100644 --- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -38,8 +38,8 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "support/ToolchainSupport.h" -using namespace arm_compute; - +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) @@ -69,7 +69,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, { const auto supported_data_layout = is_data_type_quantized(data_type) ? DataLayout::NCHW : DataLayout::NHWC; const auto error_message = std::string("Only " + string_from_data_layout(supported_data_layout) + " layout is supported for 9x9 convolution with " + string_from_data_type( - data_type) + " type"); + data_type) + + " type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((supported_data_layout != data_layout), error_message.c_str()); } @@ -98,6 +99,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + } return Status{}; } @@ -483,8 +495,6 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL } else { - const bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type); - build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size))); build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx)))); @@ -508,9 +518,35 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL } } build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_quantized" : kernel_name.str(), - build_options.options())); + + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("direct_convolution_quantized", build_options.options())); + + // Set static kernel arguments + unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1; + _kernel.setArg(idx++, -iqinfo.offset); + _kernel.setArg(idx++, -wqinfo.offset); + _kernel.setArg(idx++, oqinfo.offset); + } + else + { + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options())); + } } // Configure kernel window @@ -518,27 +554,6 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); - // Set static kernel arguments - if(is_data_type_quantized_asymmetric(data_type)) - { - const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform(); - - int output_multiplier = 0; - int output_shift = 0; - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift)); - - unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1; - _kernel.setArg(idx++, -iqinfo.offset); - _kernel.setArg(idx++, -wqinfo.offset); - _kernel.setArg(idx++, oqinfo.offset); - _kernel.setArg(idx++, output_multiplier); - _kernel.setArg(idx++, output_shift); - } - // Set config_id for enabling LWS tuning _config_id = "direct_convolution_"; _config_id += lower_string(string_from_data_type(data_type)); @@ -614,3 +629,4 @@ void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue } while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); } +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index 7c066381ce..a98eae673b 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -35,8 +35,6 @@ #include "support/ToolchainSupport.h" -using namespace arm_compute; - namespace arm_compute { namespace @@ -100,9 +98,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -class Coordinates; -} // namespace arm_compute - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() : _input(nullptr), _bias(nullptr), _output(nullptr) { @@ -180,3 +175,4 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window } while(collapsed.slide_window_slice_3D(slice)); } +} // namespace arm_compute diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index 7e22a814b5..0551874d5f 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -191,8 +191,7 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, int output_multiplier = 0; int output_shift = 0; const float multiplier = iq_info.scale * wq_info.scale()[i] / oq_info.scale; - ARM_COMPUTE_ERROR_ON(multiplier > 1.0f); - calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); output_multipliers_ptr[i] = output_multiplier; output_shifts_ptr[i] = output_shift; diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index d794cde1f4..c271f502e9 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -34,7 +34,8 @@ #include #include -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) @@ -216,3 +217,4 @@ void CLConvolutionLayer::prepare() { _function->prepare(); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index a8167ce8f7..ad0714ed15 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -60,7 +60,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // Set the GEMMLowp output stage info gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 4671be5b61..604147a37a 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -152,9 +152,9 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso } const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); - const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); - TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); + const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); if(padded_input && is_quantized) { @@ -173,6 +173,15 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso else if(is_quantized) { ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); + + float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + int output_multiplier(0); + int output_shift(0); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr, output)); } else @@ -284,7 +293,7 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier(0); int output_shift(0); - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, oq_info.offset); _gemmlowp_final.allocator()->allocate(); } diff --git a/tests/validate_examples/cl_gemm.cpp b/tests/validate_examples/cl_gemm.cpp index 39fe111448..cdf60cd65b 100644 --- a/tests/validate_examples/cl_gemm.cpp +++ b/tests/validate_examples/cl_gemm.cpp @@ -195,11 +195,13 @@ public: consume_params(gemm_options); print_parameters_internal(); + const bool is_quantized = is_data_type_quantized(data_type); + // Calculate re-quantization parameters - if(data_type == DataType::QASYMM8) + if(is_quantized) { float multiplier = scale_src0 * scale_src1 / scale_dst; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &dst_multiplier, &dst_shift); + quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); } // Initialize GEMM inputs/outputs @@ -209,7 +211,7 @@ public: init_sgemm_output(dst, src0, src1, data_type); // Configure function - if(data_type == DataType::QASYMM8) + if(is_quantized) { src0.info()->set_quantization_info(QuantizationInfo(scale_src0, offset_src0)); src1.info()->set_quantization_info(QuantizationInfo(scale_src1, offset_src1)); diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp index 9eb6c6d41d..5ee5d849c3 100644 --- a/tests/validation/CL/ConvolutionLayer.cpp +++ b/tests/validation/CL/ConvolutionLayer.cpp @@ -292,7 +292,7 @@ const auto QuantizationData = framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(0.3f, 3), - QuantizationInfo(1.f, 10), + QuantizationInfo(1.1f, 10), }); TEST_SUITE(QASYMM8) diff --git a/tests/validation/CL/DeconvolutionLayer.cpp b/tests/validation/CL/DeconvolutionLayer.cpp index ac8e170b80..090db1c2c1 100644 --- a/tests/validation/CL/DeconvolutionLayer.cpp +++ b/tests/validation/CL/DeconvolutionLayer.cpp @@ -123,19 +123,19 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( // *INDENT-ON* template -using CLDeconvolutionLayerFixture4x4 = DeconvolutionValidationFixture; +using CLDeconvolutionLayerFixture4x4 = DeconvolutionValidationFixture; template -using CLDeconvolutionLayerFixture3x3 = DeconvolutionValidationFixture; +using CLDeconvolutionLayerFixture3x3 = DeconvolutionValidationFixture; template using CLDeconvolutionLayerAsymmFixture3x3 = DeconvolutionValidationAsymmFixture; template -using CLDeconvolutionLayerFixture2x2 = DeconvolutionValidationFixture; +using CLDeconvolutionLayerFixture2x2 = DeconvolutionValidationFixture; template -using CLDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture; +using CLDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture; TEST_SUITE(Float) TEST_SUITE(FP32) @@ -286,8 +286,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedFixture3x3, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(data2x2_precommit, framework::dataset::make("DataType", DataType::QASYMM8)), data_layouts_dataset), - framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })), - framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })), + framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f, 0) })), + framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(3.f, 0) })), add_bias_dataset)) { // Validate output @@ -323,8 +323,8 @@ TEST_SUITE(W1x1) FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1, framework::dataset::make("DataType", DataType::QASYMM8)), data_layouts_dataset), - framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })), - framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })), + framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f, 0) })), + framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(3.f, 0) })), add_bias_dataset)) { // Validate output diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp index 5d8fd200d3..e2cdf5403a 100644 --- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp @@ -563,7 +563,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit, framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })), QuantizedActivationFunctionsDataset)) { // Validate output @@ -260,7 +260,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit_9x9, framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })), QuantizedActivationFunctionsDataset)) { // Validate output @@ -268,7 +268,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly, framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })), QuantizedActivationFunctionsDataset)) { // Validate output @@ -276,7 +276,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_9x9, framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })), QuantizedActivationFunctionsDataset)) { // Validate output @@ -288,7 +288,7 @@ TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_CustomDataset) FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::DirectConvolutionLayerDataset(), framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127) })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) })), QuantizedActivationFunctionsDataset)) { // Validate output diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp index af45be2cf0..091d9411b7 100644 --- a/tests/validation/CL/FullyConnectedLayer.cpp +++ b/tests/validation/CL/FullyConnectedLayer.cpp @@ -60,6 +60,12 @@ const auto CNNDataTypes = framework::dataset::make("DataType", }); const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true })); + +const auto QuantizationData = framework::dataset::make("QuantizationInfo", +{ + QuantizationInfo(1.f / 255.f, 10), + QuantizationInfo(1.1f, 10), +}); } // namespace TEST_SUITE(CL) @@ -204,20 +210,14 @@ using CLFullyConnectedLayerQuantizedFixture = FullyConnectedLayerValidationQuant TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine( - combine(datasets::SmallFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 10) }))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8)), QuantizationData)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine( - combine(datasets::LargeFullyConnectedLayerDataset(), - FullyConnectedParameters), - framework::dataset::make("DataType", DataType::QASYMM8)), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 256.f, 10) }))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8)), QuantizationData)) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h index 23918a4055..6ac5df93b3 100644 --- a/tests/validation/reference/Convolution3d.h +++ b/tests/validation/reference/Convolution3d.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__ -#define __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__ +#ifndef ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H +#define ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "tests/validation/Helpers.h" @@ -133,7 +133,7 @@ inline void convolution3d(const SimpleTensor &in, const SimpleTensor &wei int output_multiplier = 0; int output_shift = 0; const float multiplier = input_scale * weights_scale / output_scale; - arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); const int half_width_weights_start = width_weights / 2; const int half_width_weights_end = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start; @@ -171,9 +171,8 @@ inline void convolution3d(const SimpleTensor &in, const SimpleTensor &wei // Accumulate the bias acc += (*b_ptr); - acc = validation::asymm_rounding_divide_by_pow2(validation::asymm_int_mult(acc, output_multiplier), output_shift); - acc += output_offset; - acc = utility::clamp(acc, 0, 255); + // Quantize down + acc = validation::quantize_down_scale_by_fixedpoint(acc, output_multiplier, output_shift, output_offset, 0, 255); // Store the result *out_ptr = acc; @@ -182,4 +181,4 @@ inline void convolution3d(const SimpleTensor &in, const SimpleTensor &wei } // namespace convolution_3d } // namespace test } // namespace arm_compute -#endif /*__ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__ */ +#endif /* ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H */ diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp index 608093d381..0c7e92b8d0 100644 --- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp @@ -197,7 +197,7 @@ SimpleTensor depthwise_convolution_quantized(const SimpleTensor &src, cons int output_shift = 0; const float weights_scale = (is_quantized_per_channel) ? weights_scale_vec[out_z] : weights_scale_vec[0]; const float multiplier = input_scale * weights_scale / output_scale; - arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); for(int y = minimum_y; y <= minimum_y + maximum_y; y += conv_info.stride().second) { @@ -220,9 +220,8 @@ SimpleTensor depthwise_convolution_quantized(const SimpleTensor &src, cons } } val += bias_val; - val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift); - val += output_offset; - val = utility::clamp(val, 0, 255); + // Quantize down + val = quantize_down_scale_by_fixedpoint(val, output_multiplier, output_shift, output_offset, 0, 255); // Store the result dst[out_pos++] = val; diff --git a/tests/validation/reference/FullyConnectedLayer.cpp b/tests/validation/reference/FullyConnectedLayer.cpp index cd84b9cfd1..261c6453b9 100644 --- a/tests/validation/reference/FullyConnectedLayer.cpp +++ b/tests/validation/reference/FullyConnectedLayer.cpp @@ -81,7 +81,7 @@ void vector_matrix_multiply(const SimpleTensor &src, const SimpleTensor &w int output_multiplier = 0; int output_shift = 0; const float multiplier = input_scale * weights_scale / output_scale; - arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); for(int y = 0; y < rows_weights; ++y) { @@ -96,9 +96,8 @@ void vector_matrix_multiply(const SimpleTensor &src, const SimpleTensor &w // Accumulate the bias acc += bias_ptr[y]; - acc = asymm_rounding_divide_by_pow2(asymm_int_mult(acc, output_multiplier), output_shift); - acc += output_offset; - acc = utility::clamp(acc, 0, 255); + // Quantize down + acc = quantize_down_scale_by_fixedpoint(acc, output_multiplier, output_shift, output_offset, 0, 255); // Store the result dst_ptr[y] = static_cast(acc); diff --git a/tests/validation/reference/UtilsQuantizedAsymm.h b/tests/validation/reference/UtilsQuantizedAsymm.h index b7b69d588a..444696c93a 100644 --- a/tests/validation/reference/UtilsQuantizedAsymm.h +++ b/tests/validation/reference/UtilsQuantizedAsymm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef __ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H__ -#define __ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H__ +#ifndef ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H +#define ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H #include @@ -51,7 +51,25 @@ inline int32_t asymm_int_mult(int32_t a, int32_t b) int32_t ab_x2_high32 = static_cast((ab_64 + nudge) / (1ll << 31)); return overflow ? std::numeric_limits::max() : ab_x2_high32; } + +/** Quantize down the input value in range [min, max]. */ +inline int32_t quantize_down_scale_by_fixedpoint(int32_t val, int32_t result_mult_int, int32_t result_shift, + int32_t result_offset_after_shift, int32_t min, int32_t max) +{ + int32_t res = 0; + if(result_shift < 0) + { + res = asymm_int_mult(val * (1 << (-result_shift)), result_mult_int); + } + else + { + res = asymm_rounding_divide_by_pow2(asymm_int_mult(val, result_mult_int), result_shift); + } + res += result_offset_after_shift; + res = utility::clamp(res, min, max); + return res; +} } // namespace validation } // namespace test } // namespace arm_compute -#endif /* __ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H__ */ +#endif /* ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H */ -- cgit v1.2.1