diff options
author | Michele Di Giorgio <michele.digiorgio@arm.com> | 2019-10-23 10:53:10 +0100 |
---|---|---|
committer | Michele Di Giorgio <michele.digiorgio@arm.com> | 2019-11-28 10:02:15 +0000 |
commit | 14cbfb2921990d8bf125231e350e2ac8dcd95a8b (patch) | |
tree | 9bec073d72c44c480c8807601889481d9b89ee7e /src/core/CL/cl_kernels | |
parent | ed7b27dd7cbdae57b880029840ad0235523848e0 (diff) | |
download | ComputeLibrary-14cbfb2921990d8bf125231e350e2ac8dcd95a8b.tar.gz |
COMPMID-2609: Enable quantization with multiplier greater than 1 on OpenCL
Change-Id: I050f1f84e214e61f7cbb0197a672b68a4940edae
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2158
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/depthwise_convolution_quantized.cl | 146 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/direct_convolution_quantized.cl | 18 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/gemmlowp.cl | 22 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/helpers_asymm.h | 2 |
4 files changed, 137 insertions, 51 deletions
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index 08358755b1..ac1406b6d8 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -47,7 +47,7 @@ #define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size) -#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER)) +#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) #if defined(WEIGHTS_PROMOTED_TYPE) #define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size) @@ -224,9 +224,6 @@ __kernel void dwc_3x3_native_quantized8_nchw( #if defined(PER_CHANNEL_QUANTIZATION) const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel)); const int output_shift = *((__global int *)vector_offset(&output_shifts, channel)); -#else // defined(PER_CHANNEL_QUANTIZATION) - const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0)); - const int output_shift = *((__global int *)vector_offset(&output_shifts, 0)); #endif // defined(PER_CHANNEL_QUANTIZATION) int8 values0 = 0; @@ -335,7 +332,17 @@ __kernel void dwc_3x3_native_quantized8_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); +#if defined(PER_CHANNEL_QUANTIZATION) + int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8); + int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); + values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_OFFSET < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -351,7 +358,17 @@ __kernel void dwc_3x3_native_quantized8_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); +#if defined(PER_CHANNEL_QUANTIZATION) + int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8); + int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); + values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_OFFSET < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -667,7 +684,17 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); +#if defined(PER_CHANNEL_QUANTIZATION) + int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8); + int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); + values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_OFFSET < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -684,7 +711,17 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); +#if defined(PER_CHANNEL_QUANTIZATION) + int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8); + int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); + values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_OFFSET < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -943,17 +980,23 @@ __kernel void dwc_3x3_reshaped_quantized8_nhwc( acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); #else // defined(REAL_MULTIPLIER) + #if defined(PER_CHANNEL_QUANTIZATION) Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers); Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts); VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr); VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr); -#else // defined(PER_CHANNEL_QUANTIZATION) - const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); - const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); -#endif // defined(PER_CHANNEL_QUANTIZATION) - acc = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift); + VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, output_multiplier, output_shift, VEC_SIZE); + VEC_INT res_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift); + acc = select(res_shift_lt0, res_shift_gt0, output_shift >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE); +#else // OUTPUT_SHIFT < 0 + acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#endif // OUTPUT_SHIFT < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -1255,15 +1298,32 @@ __kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc( Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts); VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr); VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr); -#else // defined(PER_CHANNEL_QUANTIZATION) - const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); - const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); -#endif // defined(PER_CHANNEL_QUANTIZATION) - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); - acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift); - acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift); + res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE); + res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE); + res2_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE); + res3_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE); + res0_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); + res1_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); + res2_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift); + res3_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift); + acc0 = select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0); + acc1 = select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0); + acc2 = select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0); + acc3 = select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE); + acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE); + acc2 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE); + acc3 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE); +#else // OUTPUT_SHIFT < 0 + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); + acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift); + acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift); +#endif // OUTPUT_SHIFT < 0 +#endif // defined(PER_CHANNEL_QUANTIZATION) #endif // defined(REAL_MULTIPLIER) @@ -1375,7 +1435,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -1383,7 +1443,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( #if defined(DST_DEPTH) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; #endif /* defined(DST_DEPTH) */ int z_coord = 0; @@ -1519,11 +1579,14 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); #else // defined(REAL_MULTIPLIER) - const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); - const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); +#if OUTPUT_SHIFT < 0 + acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE); + acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE); +#else // OUTPUT_SHIFT < 0 + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#endif // OUTPUT_SHIFT < 0 #endif // defined(REAL_MULTIPLIER) acc0 += (VEC_INT)OUTPUT_OFFSET; @@ -1553,9 +1616,9 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( #endif // defined(WEIGHTS_PROMOTED_TYPE) -#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER)) +#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) -#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) +#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped * * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2) @@ -1630,7 +1693,7 @@ __kernel void dwc_MxN_native_quantized8_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0; @@ -1646,12 +1709,6 @@ __kernel void dwc_MxN_native_quantized8_nhwc( #if defined(PER_CHANNEL_QUANTIZATION) __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; - - VEC_INT output_multiplier = (VEC_INT)0; - VEC_INT output_shift = (VEC_INT)0; -#else // defined(PER_CHANNEL_QUANTIZATION) - const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); - const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); #endif // defined(PER_CHANNEL_QUANTIZATION) #if defined(DST_DEPTH) @@ -1700,11 +1757,20 @@ __kernel void dwc_MxN_native_quantized8_nhwc( #endif // defined(HAS_BIAS) #if defined(PER_CHANNEL_QUANTIZATION) - output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr)); - output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr)); + VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr)); + VEC_INT output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr)); + + VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0); + VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0); + res = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) +#if OUTPUT_SHIFT < 0 + res = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0); +#else // OUTPUT_SHIFT < 0 + res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0); +#endif // OUTPUT_OFFSET < 0 #endif // defined(PER_CHANNEL_QUANTIZATION) - res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0); res += (VEC_INT)OUTPUT_OFFSET; VEC_TYPE(VEC_SIZE) @@ -1726,5 +1792,5 @@ __kernel void dwc_MxN_native_quantized8_nhwc( } #endif // DEPTH_MULTIPLIER > 1 } -#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) +#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) #endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE) diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl index 1182428cd5..37fd9a0778 100644 --- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/direct_convolution_quantized.cl @@ -25,7 +25,7 @@ #undef CONVERT_SAT -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) #if KERNEL_SIZE == 9 @@ -194,6 +194,8 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel) * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH * @note If biases are used then -DHAS_BIAS has to be passed at compile time + * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234 + * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -227,8 +229,6 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel) * @param[in] input_offset Input offset quantization parameter * @param[in] weight_offset Weights offset quantization parameter * @param[in] output_offset Output offset quantization parameter - * @param[in] output_multiplier Output integer multiplier quantization parameter - * @param[in] output_shift Output integer shift quantization parameter */ __kernel void direct_convolution_quantized( TENSOR3D_DECLARATION(src), @@ -240,9 +240,7 @@ __kernel void direct_convolution_quantized( unsigned int weights_stride_w, int input_offset, int weight_offset, - int output_offset, - int output_multiplier, - int output_shift) + int output_offset) { Image src = CONVERT_TO_IMAGE_STRUCT(src); Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); @@ -294,9 +292,13 @@ __kernel void direct_convolution_quantized( pixels0 += (int8)(*bias_addr); #endif /* defined(HAS_BIAS) */ - pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, output_multiplier, output_shift, 8); +#if OUTPUT_SHIFT < 0 + pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#else // OUTPUT_SHIFT < 0 + pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); +#endif // OUTPUT_SHIFT < 0 pixels0 = pixels0 + output_offset; vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr); } -#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) +#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index 7a97fa6fa1..fa08b149e4 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -1673,9 +1673,17 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); - in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4); -#else // !defined(PER_CHANNEL_QUANTIZATION) + int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4); + int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4); + in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) + +#if RESULT_SHIFT < 0 + in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + #endif // defined(PER_CHANNEL_QUANTIZATION) // Add the offset terms to GEMM's result @@ -1768,7 +1776,11 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), // Multiply by result_mult_int and shift input_values *= RESULT_MULT_INT; +#if RESULT_SHIFT < 0 + input_values >>= -RESULT_SHIFT; +#else // RESULT_SHIFT >= 0 input_values >>= RESULT_SHIFT; +#endif // RESULT_SHIFT < 0 uchar4 res = convert_uchar4_sat(input_values); @@ -1850,7 +1862,11 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO #endif // defined(ADD_BIAS) // Multiply by result_mult_int and shift +#if RESULT_SHIFT < 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 // Add the offset terms to GEMM's result input_values += (int4)RESULT_OFFSET_AFTER_SHIFT; @@ -1937,7 +1953,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE // Multiply by result_mult_int and shift #if RESULT_SHIFT < 0 - input_values = ASYMM_MULT(input_values * (1 << (-RESULT_SHIFT)), RESULT_FIXEDPOINT_MULTIPLIER, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); #else // RESULT_SHIFT >= 0 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); #endif // RESULT_SHIFT < 0 diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index f7eff758c0..09409dc5e9 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -369,6 +369,8 @@ inline float dequantize_qasymm8(uchar input, float offset, float scale) #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ + ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) |