aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2019-10-23 10:53:10 +0100
committerMichele Di Giorgio <michele.digiorgio@arm.com>2019-11-28 10:02:15 +0000
commit14cbfb2921990d8bf125231e350e2ac8dcd95a8b (patch)
tree9bec073d72c44c480c8807601889481d9b89ee7e /src
parented7b27dd7cbdae57b880029840ad0235523848e0 (diff)
downloadComputeLibrary-14cbfb2921990d8bf125231e350e2ac8dcd95a8b.tar.gz
COMPMID-2609: Enable quantization with multiplier greater than 1 on OpenCL
Change-Id: I050f1f84e214e61f7cbb0197a672b68a4940edae Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/2158 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Manuel Bottini <manuel.bottini@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/CL/cl_kernels/depthwise_convolution_quantized.cl146
-rw-r--r--src/core/CL/cl_kernels/direct_convolution_quantized.cl18
-rw-r--r--src/core/CL/cl_kernels/gemmlowp.cl22
-rw-r--r--src/core/CL/cl_kernels/helpers_asymm.h2
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp8
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp8
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp20
-rw-r--r--src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp74
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp6
-rw-r--r--src/core/utils/quantization/AsymmHelpers.cpp3
-rw-r--r--src/runtime/CL/functions/CLConvolutionLayer.cpp4
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp2
-rw-r--r--src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp17
13 files changed, 237 insertions, 93 deletions
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 08358755b1..ac1406b6d8 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -47,7 +47,7 @@
#define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size)
-#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER))
+#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
#if defined(WEIGHTS_PROMOTED_TYPE)
#define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size)
@@ -224,9 +224,6 @@ __kernel void dwc_3x3_native_quantized8_nchw(
#if defined(PER_CHANNEL_QUANTIZATION)
const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel));
const int output_shift = *((__global int *)vector_offset(&output_shifts, channel));
-#else // defined(PER_CHANNEL_QUANTIZATION)
- const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0));
- const int output_shift = *((__global int *)vector_offset(&output_shifts, 0));
#endif // defined(PER_CHANNEL_QUANTIZATION)
int8 values0 = 0;
@@ -335,7 +332,17 @@ __kernel void dwc_3x3_native_quantized8_nchw(
#else // defined(REAL_MULTIPLIER)
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+ int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
+ int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+ values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else // OUTPUT_SHIFT < 0
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
#endif // defined(REAL_MULTIPLIER)
@@ -351,7 +358,17 @@ __kernel void dwc_3x3_native_quantized8_nchw(
#else // defined(REAL_MULTIPLIER)
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+ int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
+ int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+ values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+ values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else // OUTPUT_SHIFT < 0
+ values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
#endif // defined(REAL_MULTIPLIER)
@@ -667,7 +684,17 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw(
#else // defined(REAL_MULTIPLIER)
- values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+ int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
+ int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+ values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else // OUTPUT_SHIFT < 0
+ values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
#endif // defined(REAL_MULTIPLIER)
@@ -684,7 +711,17 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw(
#else // defined(REAL_MULTIPLIER)
- values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+ int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
+ int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+ values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+ values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else // OUTPUT_SHIFT < 0
+ values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
#endif // defined(REAL_MULTIPLIER)
@@ -943,17 +980,23 @@ __kernel void dwc_3x3_reshaped_quantized8_nhwc(
acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
#else // defined(REAL_MULTIPLIER)
+
#if defined(PER_CHANNEL_QUANTIZATION)
Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);
Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts);
VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
-#else // defined(PER_CHANNEL_QUANTIZATION)
- const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
- const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
-#endif // defined(PER_CHANNEL_QUANTIZATION)
- acc = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
+ VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, output_multiplier, output_shift, VEC_SIZE);
+ VEC_INT res_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
+ acc = select(res_shift_lt0, res_shift_gt0, output_shift >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+ acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else // OUTPUT_SHIFT < 0
+ acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
#endif // defined(REAL_MULTIPLIER)
@@ -1255,15 +1298,32 @@ __kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc(
Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts);
VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
-#else // defined(PER_CHANNEL_QUANTIZATION)
- const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
- const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
-#endif // defined(PER_CHANNEL_QUANTIZATION)
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
- acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
- acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+ res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);
+ res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);
+ res2_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);
+ res3_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);
+ res0_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
+ res1_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+ res2_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
+ res3_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+ acc0 = select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0);
+ acc1 = select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0);
+ acc2 = select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0);
+ acc3 = select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+ acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);
+ acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);
+ acc2 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);
+ acc3 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);
+#else // OUTPUT_SHIFT < 0
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+ acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
+ acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+#endif // OUTPUT_SHIFT < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
#endif // defined(REAL_MULTIPLIER)
@@ -1375,7 +1435,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
+ int z = get_global_id(2); // spatial coordinate y
#endif // defined(DST_DEPTH)
__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -1383,7 +1443,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
#if defined(DST_DEPTH)
__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
#else /* defined(DST_DEPTH) */
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
#endif /* defined(DST_DEPTH) */
int z_coord = 0;
@@ -1519,11 +1579,14 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
#else // defined(REAL_MULTIPLIER)
- const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
- const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
- acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
- acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+#if OUTPUT_SHIFT < 0
+ acc0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+ acc1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else // OUTPUT_SHIFT < 0
+ acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+ acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
#endif // defined(REAL_MULTIPLIER)
acc0 += (VEC_INT)OUTPUT_OFFSET;
@@ -1553,9 +1616,9 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
#endif // defined(WEIGHTS_PROMOTED_TYPE)
-#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER))
+#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET)
+#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
*
* @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
@@ -1630,7 +1693,7 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
int b = get_global_id(2) / (int)DST_DEPTH; // batch
#else // defined(DST_DEPTH)
- int z = get_global_id(2); // spatial coordinate y
+ int z = get_global_id(2); // spatial coordinate y
#endif // defined(DST_DEPTH)
__global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
@@ -1646,12 +1709,6 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
#if defined(PER_CHANNEL_QUANTIZATION)
__global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
__global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
-
- VEC_INT output_multiplier = (VEC_INT)0;
- VEC_INT output_shift = (VEC_INT)0;
-#else // defined(PER_CHANNEL_QUANTIZATION)
- const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
- const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
#endif // defined(PER_CHANNEL_QUANTIZATION)
#if defined(DST_DEPTH)
@@ -1700,11 +1757,20 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
#endif // defined(HAS_BIAS)
#if defined(PER_CHANNEL_QUANTIZATION)
- output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
- output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr));
+ VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
+ VEC_INT output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr));
+
+ VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0);
+ VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
+ res = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+ res = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
+#else // OUTPUT_SHIFT < 0
+ res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
+#endif // OUTPUT_OFFSET < 0
#endif // defined(PER_CHANNEL_QUANTIZATION)
- res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
res += (VEC_INT)OUTPUT_OFFSET;
VEC_TYPE(VEC_SIZE)
@@ -1726,5 +1792,5 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
}
#endif // DEPTH_MULTIPLIER > 1
}
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET)
+#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
#endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
index 1182428cd5..37fd9a0778 100644
--- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
@@ -25,7 +25,7 @@
#undef CONVERT_SAT
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
#if KERNEL_SIZE == 9
@@ -194,6 +194,8 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel)
* @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
* @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
* @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -227,8 +229,6 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel)
* @param[in] input_offset Input offset quantization parameter
* @param[in] weight_offset Weights offset quantization parameter
* @param[in] output_offset Output offset quantization parameter
- * @param[in] output_multiplier Output integer multiplier quantization parameter
- * @param[in] output_shift Output integer shift quantization parameter
*/
__kernel void direct_convolution_quantized(
TENSOR3D_DECLARATION(src),
@@ -240,9 +240,7 @@ __kernel void direct_convolution_quantized(
unsigned int weights_stride_w,
int input_offset,
int weight_offset,
- int output_offset,
- int output_multiplier,
- int output_shift)
+ int output_offset)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
@@ -294,9 +292,13 @@ __kernel void direct_convolution_quantized(
pixels0 += (int8)(*bias_addr);
#endif /* defined(HAS_BIAS) */
- pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, output_multiplier, output_shift, 8);
+#if OUTPUT_SHIFT < 0
+ pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else // OUTPUT_SHIFT < 0
+ pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_SHIFT < 0
pixels0 = pixels0 + output_offset;
vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr);
}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 7a97fa6fa1..fa08b149e4 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1673,9 +1673,17 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC
int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
- in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
-#else // !defined(PER_CHANNEL_QUANTIZATION)
+ int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
+ int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
+ in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+ in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
#endif // defined(PER_CHANNEL_QUANTIZATION)
// Add the offset terms to GEMM's result
@@ -1768,7 +1776,11 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
// Multiply by result_mult_int and shift
input_values *= RESULT_MULT_INT;
+#if RESULT_SHIFT < 0
+ input_values >>= -RESULT_SHIFT;
+#else // RESULT_SHIFT >= 0
input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
uchar4 res = convert_uchar4_sat(input_values);
@@ -1850,7 +1862,11 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
#endif // defined(ADD_BIAS)
// Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
// Add the offset terms to GEMM's result
input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
@@ -1937,7 +1953,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
// Multiply by result_mult_int and shift
#if RESULT_SHIFT < 0
- input_values = ASYMM_MULT(input_values * (1 << (-RESULT_SHIFT)), RESULT_FIXEDPOINT_MULTIPLIER, 4);
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
#else // RESULT_SHIFT >= 0
input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
#endif // RESULT_SHIFT < 0
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index f7eff758c0..09409dc5e9 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -369,6 +369,8 @@ inline float dequantize_qasymm8(uchar input, float offset, float scale)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index a2f4a913ce..d9705de5a4 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -297,6 +297,14 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input,
build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
+ // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
+ float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
if(act_info.enabled())
{
const int a_val = quantize_qasymm8(act_info.a(), oq_info);
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index d5f37f32ce..8db85edc62 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -260,6 +260,14 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input,
build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
+ // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
+ float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
if(act_info.enabled())
{
const int a_val = quantize_qasymm8(act_info.a(), oq_info);
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 3fc236eaa7..2155306d62 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -107,6 +107,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
+ if(is_data_type_quantized(input->data_type()))
+ {
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+
+ float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ }
+
return Status{};
}
@@ -236,6 +248,14 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input,
build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION");
+ // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
+ float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
if(dwc_info.activation_info.enabled())
{
const int a_val = quantize_qasymm8(dwc_info.activation_info.a(), oq_info);
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 7b74a5a98c..e61e5c3901 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -38,8 +38,8 @@
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
@@ -69,7 +69,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
{
const auto supported_data_layout = is_data_type_quantized(data_type) ? DataLayout::NCHW : DataLayout::NHWC;
const auto error_message = std::string("Only " + string_from_data_layout(supported_data_layout) + " layout is supported for 9x9 convolution with " + string_from_data_type(
- data_type) + " type");
+ data_type)
+ + " type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((supported_data_layout != data_layout), error_message.c_str());
}
@@ -98,6 +99,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
+ if(is_data_type_quantized(data_type))
+ {
+ const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ }
return Status{};
}
@@ -483,8 +495,6 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
}
else
{
- const bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
- build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
@@ -508,9 +518,35 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
}
}
build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_quantized" : kernel_name.str(),
- build_options.options()));
+
+ if(is_data_type_quantized(data_type))
+ {
+ const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
+
+ float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+ build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+ build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+ build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("direct_convolution_quantized", build_options.options()));
+
+ // Set static kernel arguments
+ unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
+ _kernel.setArg(idx++, -iqinfo.offset);
+ _kernel.setArg(idx++, -wqinfo.offset);
+ _kernel.setArg(idx++, oqinfo.offset);
+ }
+ else
+ {
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));
+ }
}
// Configure kernel window
@@ -518,27 +554,6 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
- // Set static kernel arguments
- if(is_data_type_quantized_asymmetric(data_type))
- {
- const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
-
- int output_multiplier = 0;
- int output_shift = 0;
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
-
- unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
- _kernel.setArg(idx++, -iqinfo.offset);
- _kernel.setArg(idx++, -wqinfo.offset);
- _kernel.setArg(idx++, oqinfo.offset);
- _kernel.setArg(idx++, output_multiplier);
- _kernel.setArg(idx++, output_shift);
- }
-
// Set config_id for enabling LWS tuning
_config_id = "direct_convolution_";
_config_id += lower_string(string_from_data_type(data_type));
@@ -614,3 +629,4 @@ void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue
}
while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 7c066381ce..a98eae673b 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -35,8 +35,6 @@
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
namespace arm_compute
{
namespace
@@ -100,9 +98,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-class Coordinates;
-} // namespace arm_compute
-
CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
: _input(nullptr), _bias(nullptr), _output(nullptr)
{
@@ -180,3 +175,4 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
}
while(collapsed.slide_window_slice_3D(slice));
}
+} // namespace arm_compute
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 7e22a814b5..0551874d5f 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -191,8 +191,7 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
int output_multiplier = 0;
int output_shift = 0;
const float multiplier = iq_info.scale * wq_info.scale()[i] / oq_info.scale;
- ARM_COMPUTE_ERROR_ON(multiplier > 1.0f);
- calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
output_multipliers_ptr[i] = output_multiplier;
output_shifts_ptr[i] = output_shift;
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index d794cde1f4..c271f502e9 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -34,7 +34,8 @@
#include <memory>
#include <tuple>
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -216,3 +217,4 @@ void CLConvolutionLayer::prepare()
{
_function->prepare();
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index a8167ce8f7..ad0714ed15 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -60,7 +60,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn
const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
// Set the GEMMLowp output stage info
gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 4671be5b61..604147a37a 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -152,9 +152,9 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
}
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
- const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
- TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
+ const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+ TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
if(padded_input && is_quantized)
{
@@ -173,6 +173,15 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
else if(is_quantized)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+
+ float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ int output_multiplier(0);
+ int output_shift(0);
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr, output));
}
else
@@ -284,7 +293,7 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier(0);
int output_shift(0);
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
_gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, oq_info.offset);
_gemmlowp_final.allocator()->allocate();
}