From 14cbfb2921990d8bf125231e350e2ac8dcd95a8b Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Wed, 23 Oct 2019 10:53:10 +0100
Subject: COMPMID-2609: Enable quantization with multiplier greater than 1 on
 OpenCL

Change-Id: I050f1f84e214e61f7cbb0197a672b68a4940edae
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2158
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
---
 .../runtime/CL/functions/CLGEMMConvolutionLayer.h  |  11 +-
 .../CL/functions/CLGEMMDeconvolutionLayer.h        |   7 +-
 .../cl_kernels/depthwise_convolution_quantized.cl  | 146 +++++++++++++++------
 .../CL/cl_kernels/direct_convolution_quantized.cl  |  18 +--
 src/core/CL/cl_kernels/gemmlowp.cl                 |  22 +++-
 src/core/CL/cl_kernels/helpers_asymm.h             |   2 +
 .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp   |   8 ++
 .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp   |   8 ++
 .../CLDepthwiseConvolutionLayerNativeKernel.cpp    |  20 +++
 .../CL/kernels/CLDirectConvolutionLayerKernel.cpp  |  74 +++++++----
 ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp |   6 +-
 src/core/utils/quantization/AsymmHelpers.cpp       |   3 +-
 src/runtime/CL/functions/CLConvolutionLayer.cpp    |   4 +-
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |   2 +-
 .../CL/functions/CLGEMMDeconvolutionLayer.cpp      |  17 ++-
 tests/validate_examples/cl_gemm.cpp                |   8 +-
 tests/validation/CL/ConvolutionLayer.cpp           |   2 +-
 tests/validation/CL/DeconvolutionLayer.cpp         |  24 ++--
 tests/validation/CL/DepthwiseConvolutionLayer.cpp  |  16 +--
 tests/validation/CL/DirectConvolutionLayer.cpp     |  10 +-
 tests/validation/CL/FullyConnectedLayer.cpp        |  20 +--
 tests/validation/reference/Convolution3d.h         |  13 +-
 .../reference/DepthwiseConvolutionLayer.cpp        |   7 +-
 tests/validation/reference/FullyConnectedLayer.cpp |   7 +-
 tests/validation/reference/UtilsQuantizedAsymm.h   |  26 +++-
 25 files changed, 319 insertions(+), 162 deletions(-)

diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 3392f11b06..ce034cd9ba 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -21,14 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__
-#define __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__
+#ifndef ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
+#define ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
 
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
 #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
@@ -36,8 +34,6 @@
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/ITransformWeights.h"
 #include "arm_compute/runtime/IWeightsManager.h"
@@ -143,7 +139,6 @@ private:
  * -# @ref CLGEMM (if the data type is FP32 or FP16)
  * -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8)
  * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8)
- * -# @ref CLElementwiseOperationKernel for addition (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
  * -# @ref CLCol2ImKernel (if NCHW data layout)
  */
 class CLGEMMConvolutionLayer : public IFunction
@@ -267,4 +262,4 @@ private:
     bool _is_prepared;
 };
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__ */
+#endif /* ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
index 0a71995158..3df9205f48 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
@@ -21,12 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H__
-#define __ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H__
+#ifndef ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H
+#define ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H
 
 #include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/CL/functions/CLPermute.h"
 #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/CL/functions/CLSlice.h"
@@ -141,4 +142,4 @@ private:
     bool             _is_quantized;
 };
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H__ */
+#endif /* ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H */
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 08358755b1..ac1406b6d8 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -47,7 +47,7 @@
 
 #define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size)
 
-#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER))
+#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
 
 #if defined(WEIGHTS_PROMOTED_TYPE)
 #define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size)
@@ -224,9 +224,6 @@ __kernel void dwc_3x3_native_quantized8_nchw(
 #if defined(PER_CHANNEL_QUANTIZATION)
     const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel));
     const int output_shift      = *((__global int *)vector_offset(&output_shifts, channel));
-#else  // defined(PER_CHANNEL_QUANTIZATION)
-    const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0));
-    const int output_shift      = *((__global int *)vector_offset(&output_shifts, 0));
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
     int8 values0 = 0;
@@ -335,7 +332,17 @@ __kernel void dwc_3x3_native_quantized8_nchw(
 
 #else // defined(REAL_MULTIPLIER)
 
-    values0                            = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res0_shift_lt0                = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    int8 res0_shift_gt0                = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    values0                            = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -351,7 +358,17 @@ __kernel void dwc_3x3_native_quantized8_nchw(
 
 #else // defined(REAL_MULTIPLIER)
 
-    values1                           = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res1_shift_lt0      = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    int8 res1_shift_gt0      = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    values1                  = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -667,7 +684,17 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw(
 
 #else // defined(REAL_MULTIPLIER)
 
-    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    values0             = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -684,7 +711,17 @@ __kernel void dwc_3x3_native_quantized8_dot8_nchw(
 
 #else // defined(REAL_MULTIPLIER)
 
-    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    values1             = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -943,17 +980,23 @@ __kernel void dwc_3x3_reshaped_quantized8_nhwc(
     acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
 
 #else // defined(REAL_MULTIPLIER)
+
 #if defined(PER_CHANNEL_QUANTIZATION)
     Vector          output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);
     Vector          output_shifts      = CONVERT_TO_VECTOR_STRUCT(output_shifts);
     VEC_INT         output_multiplier  = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
     VEC_INT         output_shift       = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
-#else  // defined(PER_CHANNEL_QUANTIZATION)
-    const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
-    const int output_shift      = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
-#endif // defined(PER_CHANNEL_QUANTIZATION)
 
-    acc                                = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
+    VEC_INT res_shift_lt0              = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, output_multiplier, output_shift, VEC_SIZE);
+    VEC_INT res_shift_gt0              = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
+    acc                                = select(res_shift_lt0, res_shift_gt0, output_shift >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    acc     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc     = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -1255,15 +1298,32 @@ __kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc(
     Vector          output_shifts      = CONVERT_TO_VECTOR_STRUCT(output_shifts);
     VEC_INT         output_multiplier  = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
     VEC_INT         output_shift       = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
-#else  // defined(PER_CHANNEL_QUANTIZATION)
-    const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
-    const int output_shift      = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
-#endif // defined(PER_CHANNEL_QUANTIZATION)
 
-    acc0                     = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
-    acc1                     = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
-    acc2                     = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
-    acc3                     = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+    res0_shift_lt0           = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);
+    res1_shift_lt0           = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);
+    res2_shift_lt0           = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);
+    res3_shift_lt0           = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);
+    res0_shift_gt0           = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
+    res1_shift_gt0           = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+    res2_shift_gt0           = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
+    res3_shift_gt0           = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+    acc0                     = select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0);
+    acc1                     = select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0);
+    acc2                     = select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0);
+    acc3                     = select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    acc0    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);
+    acc1    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);
+    acc2    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);
+    acc3    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc0    = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
+    acc1    = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+    acc2    = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
+    acc3    = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+#endif // OUTPUT_SHIFT < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -1375,7 +1435,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
 #else                                          // defined(DST_DEPTH)
-    int      z                        = get_global_id(2); // spatial coordinate y
+    int      z               = get_global_id(2); // spatial coordinate y
 #endif                                         // defined(DST_DEPTH)
 
     __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -1383,7 +1443,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
 #if defined(DST_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
 #else  /* defined(DST_DEPTH) */
-    __global uchar *src_addr          = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
 #endif /* defined(DST_DEPTH) */
 
     int  z_coord = 0;
@@ -1519,11 +1579,14 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
     acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
 
 #else // defined(REAL_MULTIPLIER)
-    const int       output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
-    const int       output_shift      = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
 
-    acc0                     = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
-    acc1                     = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+#if OUTPUT_SHIFT < 0
+    acc0                     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+    acc1                     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc0    = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1    = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
 
 #endif // defined(REAL_MULTIPLIER)
     acc0 += (VEC_INT)OUTPUT_OFFSET;
@@ -1553,9 +1616,9 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
 
 #endif // defined(WEIGHTS_PROMOTED_TYPE)
 
-#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER))
+#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
 
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET)
+#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
 /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
  *
  * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
@@ -1630,7 +1693,7 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
 #else                                          // defined(DST_DEPTH)
-    int       z                 = get_global_id(2); // spatial coordinate y
+    int z = get_global_id(2); // spatial coordinate y
 #endif                                         // defined(DST_DEPTH)
 
     __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
@@ -1646,12 +1709,6 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
 #if defined(PER_CHANNEL_QUANTIZATION)
     __global uchar *out_mul_addr   = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
     __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
-
-    VEC_INT output_multiplier = (VEC_INT)0;
-    VEC_INT output_shift      = (VEC_INT)0;
-#else  // defined(PER_CHANNEL_QUANTIZATION)
-    const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
-    const int output_shift      = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #if defined(DST_DEPTH)
@@ -1700,11 +1757,20 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
 #endif // defined(HAS_BIAS)
 
 #if defined(PER_CHANNEL_QUANTIZATION)
-        output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
-        output_shift      = VLOAD(N0)(0, (__global int *)(out_shift_addr));
+        VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
+        VEC_INT output_shift      = VLOAD(N0)(0, (__global int *)(out_shift_addr));
+
+        VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0);
+        VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
+        res                   = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+        res   = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
+#else  // OUTPUT_SHIFT < 0
+        res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
+#endif // OUTPUT_OFFSET < 0
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
-        res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
         res += (VEC_INT)OUTPUT_OFFSET;
 
         VEC_TYPE(VEC_SIZE)
@@ -1726,5 +1792,5 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
     }
 #endif // DEPTH_MULTIPLIER > 1
 }
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET)
+#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
 #endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
index 1182428cd5..37fd9a0778 100644
--- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
@@ -25,7 +25,7 @@
 
 #undef CONVERT_SAT
 
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
 
 #if KERNEL_SIZE == 9
 
@@ -194,6 +194,8 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel)
  * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
  * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
  * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
  *
  * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
  * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
@@ -227,8 +229,6 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel)
  * @param[in]  input_offset                          Input offset quantization parameter
  * @param[in]  weight_offset                         Weights offset quantization parameter
  * @param[in]  output_offset                         Output offset quantization parameter
- * @param[in]  output_multiplier                     Output integer multiplier quantization parameter
- * @param[in]  output_shift                          Output integer shift quantization parameter
  */
 __kernel void direct_convolution_quantized(
     TENSOR3D_DECLARATION(src),
@@ -240,9 +240,7 @@ __kernel void direct_convolution_quantized(
     unsigned int weights_stride_w,
     int          input_offset,
     int          weight_offset,
-    int          output_offset,
-    int          output_multiplier,
-    int          output_shift)
+    int          output_offset)
 {
     Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
@@ -294,9 +292,13 @@ __kernel void direct_convolution_quantized(
     pixels0 += (int8)(*bias_addr);
 #endif /* defined(HAS_BIAS) */
 
-    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, output_multiplier, output_shift, 8);
+#if OUTPUT_SHIFT < 0
+    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_SHIFT < 0
     pixels0 = pixels0 + output_offset;
 
     vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr);
 }
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 7a97fa6fa1..fa08b149e4 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1673,9 +1673,17 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC
     int4            result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
     int4            result_shifts_values      = vload4(0, (__global int *)result_shifts_addr);
 
-    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
-#else  // !defined(PER_CHANNEL_QUANTIZATION)
+    int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
+    int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
+    in_s32                = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#else  // RESULT_SHIFT >= 0
     in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
     // Add the offset terms to GEMM's result
@@ -1768,7 +1776,11 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
     // Multiply by result_mult_int and shift
     input_values *= RESULT_MULT_INT;
 
+#if RESULT_SHIFT < 0
+    input_values >>= -RESULT_SHIFT;
+#else  // RESULT_SHIFT >= 0
     input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
 
     uchar4 res = convert_uchar4_sat(input_values);
 
@@ -1850,7 +1862,11 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO
 #endif // defined(ADD_BIAS)
 
     // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else  // RESULT_SHIFT >= 0
     input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
 
     // Add the offset terms to GEMM's result
     input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
@@ -1937,7 +1953,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE
 
     // Multiply by result_mult_int and shift
 #if RESULT_SHIFT < 0
-    input_values = ASYMM_MULT(input_values * (1 << (-RESULT_SHIFT)), RESULT_FIXEDPOINT_MULTIPLIER, 4);
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
 #else  // RESULT_SHIFT >= 0
     input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
 #endif // RESULT_SHIFT < 0
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index f7eff758c0..09409dc5e9 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -369,6 +369,8 @@ inline float dequantize_qasymm8(uchar input, float offset, float scale)
 
 #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
 #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
 #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
     ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
 #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index a2f4a913ce..d9705de5a4 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -297,6 +297,14 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input,
         build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
         build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
 
+        // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
+        float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
         if(act_info.enabled())
         {
             const int a_val = quantize_qasymm8(act_info.a(), oq_info);
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index d5f37f32ce..8db85edc62 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -260,6 +260,14 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input,
         build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
         build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8");
 
+        // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
+        float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
         if(act_info.enabled())
         {
             const int a_val = quantize_qasymm8(act_info.a(), oq_info);
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 3fc236eaa7..2155306d62 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -107,6 +107,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
     }
 
+    if(is_data_type_quantized(input->data_type()))
+    {
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+
+        float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+    }
+
     return Status{};
 }
 
@@ -236,6 +248,14 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input,
         build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
         build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION");
 
+        // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
+        float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
         if(dwc_info.activation_info.enabled())
         {
             const int a_val = quantize_qasymm8(dwc_info.activation_info.a(), oq_info);
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 7b74a5a98c..e61e5c3901 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -38,8 +38,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
@@ -69,7 +69,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
     {
         const auto supported_data_layout = is_data_type_quantized(data_type) ? DataLayout::NCHW : DataLayout::NHWC;
         const auto error_message         = std::string("Only " + string_from_data_layout(supported_data_layout) + " layout is supported for 9x9 convolution with " + string_from_data_type(
-                                                           data_type) + " type");
+                                                           data_type)
+                                                       + " type");
 
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((supported_data_layout != data_layout), error_message.c_str());
     }
@@ -98,6 +99,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
+    if(is_data_type_quantized(data_type))
+    {
+        const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
+        const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+
+        float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+    }
     return Status{};
 }
 
@@ -483,8 +495,6 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     }
     else
     {
-        const bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
-        build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
         build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
         build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
         build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
@@ -508,9 +518,35 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
             }
         }
         build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
-        // Create kernel
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_quantized" : kernel_name.str(),
-                                                                               build_options.options()));
+
+        if(is_data_type_quantized(data_type))
+        {
+            const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
+            const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
+            const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
+
+            float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+            int   output_multiplier = 0;
+            int   output_shift      = 0;
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+            build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+            build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+            build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
+
+            // Create kernel
+            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("direct_convolution_quantized", build_options.options()));
+
+            // Set static kernel arguments
+            unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
+            _kernel.setArg(idx++, -iqinfo.offset);
+            _kernel.setArg(idx++, -wqinfo.offset);
+            _kernel.setArg(idx++, oqinfo.offset);
+        }
+        else
+        {
+            // Create kernel
+            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));
+        }
     }
 
     // Configure kernel window
@@ -518,27 +554,6 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
-    // Set static kernel arguments
-    if(is_data_type_quantized_asymmetric(data_type))
-    {
-        const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();
-
-        int output_multiplier = 0;
-        int output_shift      = 0;
-
-        float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
-        ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
-
-        unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
-        _kernel.setArg(idx++, -iqinfo.offset);
-        _kernel.setArg(idx++, -wqinfo.offset);
-        _kernel.setArg(idx++, oqinfo.offset);
-        _kernel.setArg(idx++, output_multiplier);
-        _kernel.setArg(idx++, output_shift);
-    }
-
     // Set config_id for enabling LWS tuning
     _config_id = "direct_convolution_";
     _config_id += lower_string(string_from_data_type(data_type));
@@ -614,3 +629,4 @@ void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue
     }
     while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 7c066381ce..a98eae673b 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -35,8 +35,6 @@
 
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
 namespace arm_compute
 {
 namespace
@@ -100,9 +98,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 }
 } // namespace
 
-class Coordinates;
-} // namespace arm_compute
-
 CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
     : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
@@ -180,3 +175,4 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 7e22a814b5..0551874d5f 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -191,8 +191,7 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
         int         output_multiplier = 0;
         int         output_shift      = 0;
         const float multiplier        = iq_info.scale * wq_info.scale()[i] / oq_info.scale;
-        ARM_COMPUTE_ERROR_ON(multiplier > 1.0f);
-        calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
 
         output_multipliers_ptr[i] = output_multiplier;
         output_shifts_ptr[i]      = output_shift;
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index d794cde1f4..c271f502e9 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -34,7 +34,8 @@
 #include <memory>
 #include <tuple>
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
 
 CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -216,3 +217,4 @@ void CLConvolutionLayer::prepare()
 {
     _function->prepare();
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index a8167ce8f7..ad0714ed15 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -60,7 +60,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn
         const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
         int         output_multiplier = 0;
         int         output_shift      = 0;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
         // Set the GEMMLowp output stage info
         gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 4671be5b61..604147a37a 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -152,9 +152,9 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
     }
 
     const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
-    const TensorShape deconv_shape       = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
-    TensorInfo        col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+    auto                out_dims           = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
+    const TensorShape   deconv_shape       = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+    TensorInfo          col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
 
     if(padded_input && is_quantized)
     {
@@ -173,6 +173,15 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
     else if(is_quantized)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+
+        float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+        int   output_multiplier(0);
+        int   output_shift(0);
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr, output));
     }
     else
@@ -284,7 +293,7 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor
         float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
         int   output_multiplier(0);
         int   output_shift(0);
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
         _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, oq_info.offset);
         _gemmlowp_final.allocator()->allocate();
     }
diff --git a/tests/validate_examples/cl_gemm.cpp b/tests/validate_examples/cl_gemm.cpp
index 39fe111448..cdf60cd65b 100644
--- a/tests/validate_examples/cl_gemm.cpp
+++ b/tests/validate_examples/cl_gemm.cpp
@@ -195,11 +195,13 @@ public:
         consume_params(gemm_options);
         print_parameters_internal();
 
+        const bool is_quantized = is_data_type_quantized(data_type);
+
         // Calculate re-quantization parameters
-        if(data_type == DataType::QASYMM8)
+        if(is_quantized)
         {
             float multiplier = scale_src0 * scale_src1 / scale_dst;
-            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &dst_multiplier, &dst_shift);
+            quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
         }
 
         // Initialize GEMM inputs/outputs
@@ -209,7 +211,7 @@ public:
         init_sgemm_output(dst, src0, src1, data_type);
 
         // Configure function
-        if(data_type == DataType::QASYMM8)
+        if(is_quantized)
         {
             src0.info()->set_quantization_info(QuantizationInfo(scale_src0, offset_src0));
             src1.info()->set_quantization_info(QuantizationInfo(scale_src1, offset_src1));
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index 9eb6c6d41d..5ee5d849c3 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -292,7 +292,7 @@ const auto QuantizationData = framework::dataset::make("QuantizationInfo",
 {
     QuantizationInfo(0.5f, 10),
     QuantizationInfo(0.3f, 3),
-    QuantizationInfo(1.f, 10),
+    QuantizationInfo(1.1f, 10),
 });
 TEST_SUITE(QASYMM8)
 
diff --git a/tests/validation/CL/DeconvolutionLayer.cpp b/tests/validation/CL/DeconvolutionLayer.cpp
index ac8e170b80..090db1c2c1 100644
--- a/tests/validation/CL/DeconvolutionLayer.cpp
+++ b/tests/validation/CL/DeconvolutionLayer.cpp
@@ -123,19 +123,19 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
 // *INDENT-ON*
 
 template <typename T>
-using CLDeconvolutionLayerFixture4x4      = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 4, 4>;
+using CLDeconvolutionLayerFixture4x4 = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 4, 4>;
 
 template <typename T>
-using CLDeconvolutionLayerFixture3x3      = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 3, 3>;
+using CLDeconvolutionLayerFixture3x3 = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 3, 3>;
 
 template <typename T>
 using CLDeconvolutionLayerAsymmFixture3x3 = DeconvolutionValidationAsymmFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 3, 3>;
 
 template <typename T>
-using CLDeconvolutionLayerFixture2x2      = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 2, 2>;
+using CLDeconvolutionLayerFixture2x2 = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 2, 2>;
 
 template <typename T>
-using CLDeconvolutionLayerFixture1x1      = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 1, 1>;
+using CLDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture<CLTensor, CLAccessor, CLDeconvolutionLayer, T, 1, 1>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -286,8 +286,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        framework::dataset::make("DataType",
                                                 DataType::QASYMM8)),
                        data_layouts_dataset),
-                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })),
-                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })),
+                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f, 0) })),
+                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(3.f, 0) })),
                        add_bias_dataset))
 {
     // Validate output
@@ -297,8 +297,8 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        framework::dataset::make("DataType",
                                                 DataType::QASYMM8)),
                        data_layouts_dataset),
-                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })),
-                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })),
+                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f, 0) })),
+                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(3.f, 0) })),
                        add_bias_dataset))
 {
     // Validate output
@@ -310,8 +310,8 @@ TEST_SUITE(W2x2)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDeconvolutionLayerQuantizedFixture2x2<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(data2x2_precommit,
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        data_layouts_dataset),
-                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })),
-                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })),
+                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f, 0) })),
+                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(3.f, 0) })),
                        add_bias_dataset))
 {
     // Validate output
@@ -323,8 +323,8 @@ TEST_SUITE(W1x1)
 FIXTURE_DATA_TEST_CASE(Run, CLDeconvolutionLayerQuantizedFixture1x1<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1, framework::dataset::make("DataType",
                                                                                                                        DataType::QASYMM8)),
                                                                                                                        data_layouts_dataset),
-                                                                                                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f / 255.f, 0) })),
-                                                                                                                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(4.f / 255.f, 0) })),
+                                                                                                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 0), QuantizationInfo(2.f, 0) })),
+                                                                                                                       framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(3.f / 255.f, 0), QuantizationInfo(3.f, 0) })),
                                                                                                                        add_bias_dataset))
 {
     // Validate output
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 5d8fd200d3..e2cdf5403a 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -563,7 +563,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
                                                                        depth_multipliers),
                                                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })),
                                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -574,7 +574,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                                                                        large_depth_multipliers),
                                                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
                                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })),
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -586,7 +586,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                                                        depth_multipliers),
                                                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
                                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })),
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -597,7 +597,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
                                                                        large_depth_multipliers),
                                                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(1.3f, 10) })),
                                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -611,7 +611,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
                                                                        depth_multipliers),
                                                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })),
+                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10), QuantizationInfo(2.2f, 10) })),
                                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -622,7 +622,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
                                                                        large_depth_multipliers),
                                                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
                                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -634,7 +634,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                                        depth_multipliers),
                                                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
                                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
@@ -645,7 +645,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uin
                        combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
                                                                        large_depth_multipliers),
                                                                framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                                                       framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10), QuantizationInfo(2.2f, 10) })),
                                                framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                ActivationFunctionsDataset))
diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp
index 5007738785..15b6c838fa 100644
--- a/tests/validation/CL/DirectConvolutionLayer.cpp
+++ b/tests/validation/CL/DirectConvolutionLayer.cpp
@@ -251,7 +251,7 @@ TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit, framework::dataset::make("DataType",
                        DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
@@ -260,7 +260,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<uint8_
 FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit_9x9,
                        framework::dataset::make("DataType",
                                                 DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
@@ -268,7 +268,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerQuantizedFixture<uin
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly, framework::dataset::make("DataType",
                        DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
@@ -276,7 +276,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture<uint8_
 }
 FIXTURE_DATA_TEST_CASE(RunLarge9x9, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_9x9, framework::dataset::make("DataType",
                        DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
@@ -288,7 +288,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_CustomDataset)
 FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::DirectConvolutionLayerDataset(),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), QuantizationInfo(1.1f, 10) })),
                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp
index af45be2cf0..091d9411b7 100644
--- a/tests/validation/CL/FullyConnectedLayer.cpp
+++ b/tests/validation/CL/FullyConnectedLayer.cpp
@@ -60,6 +60,12 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 });
 
 const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true }));
+
+const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+{
+    QuantizationInfo(1.f / 255.f, 10),
+    QuantizationInfo(1.1f, 10),
+});
 } // namespace
 
 TEST_SUITE(CL)
@@ -204,20 +210,14 @@ using CLFullyConnectedLayerQuantizedFixture = FullyConnectedLayerValidationQuant
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(
-                           combine(datasets::SmallFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8)), QuantizationData))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(
-                           combine(datasets::LargeFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 256.f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters), framework::dataset::make("DataType", DataType::QASYMM8)), QuantizationData))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
index 23918a4055..6ac5df93b3 100644
--- a/tests/validation/reference/Convolution3d.h
+++ b/tests/validation/reference/Convolution3d.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__
-#define __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__
+#ifndef ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H
+#define ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H
 
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "tests/validation/Helpers.h"
@@ -133,7 +133,7 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &wei
     int         output_multiplier = 0;
     int         output_shift      = 0;
     const float multiplier        = input_scale * weights_scale / output_scale;
-    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+    arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
 
     const int half_width_weights_start  = width_weights / 2;
     const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
@@ -171,9 +171,8 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &wei
     // Accumulate the bias
     acc += (*b_ptr);
 
-    acc = validation::asymm_rounding_divide_by_pow2(validation::asymm_int_mult(acc, output_multiplier), output_shift);
-    acc += output_offset;
-    acc = utility::clamp<int32_t>(acc, 0, 255);
+    // Quantize down
+    acc = validation::quantize_down_scale_by_fixedpoint(acc, output_multiplier, output_shift, output_offset, 0, 255);
 
     // Store the result
     *out_ptr = acc;
@@ -182,4 +181,4 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &wei
 } // namespace convolution_3d
 } // namespace test
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__ */
+#endif /* ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H */
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index 608093d381..0c7e92b8d0 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -197,7 +197,7 @@ SimpleTensor<T> depthwise_convolution_quantized(const SimpleTensor<T> &src, cons
                 int         output_shift      = 0;
                 const float weights_scale     = (is_quantized_per_channel) ? weights_scale_vec[out_z] : weights_scale_vec[0];
                 const float multiplier        = input_scale * weights_scale / output_scale;
-                arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+                arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
 
                 for(int y = minimum_y; y <= minimum_y + maximum_y; y += conv_info.stride().second)
                 {
@@ -220,9 +220,8 @@ SimpleTensor<T> depthwise_convolution_quantized(const SimpleTensor<T> &src, cons
                             }
                         }
                         val += bias_val;
-                        val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift);
-                        val += output_offset;
-                        val = utility::clamp<int32_t>(val, 0, 255);
+                        // Quantize down
+                        val = quantize_down_scale_by_fixedpoint(val, output_multiplier, output_shift, output_offset, 0, 255);
 
                         // Store the result
                         dst[out_pos++] = val;
diff --git a/tests/validation/reference/FullyConnectedLayer.cpp b/tests/validation/reference/FullyConnectedLayer.cpp
index cd84b9cfd1..261c6453b9 100644
--- a/tests/validation/reference/FullyConnectedLayer.cpp
+++ b/tests/validation/reference/FullyConnectedLayer.cpp
@@ -81,7 +81,7 @@ void vector_matrix_multiply(const SimpleTensor<T> &src, const SimpleTensor<T> &w
     int         output_multiplier = 0;
     int         output_shift      = 0;
     const float multiplier        = input_scale * weights_scale / output_scale;
-    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+    arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
 
     for(int y = 0; y < rows_weights; ++y)
     {
@@ -96,9 +96,8 @@ void vector_matrix_multiply(const SimpleTensor<T> &src, const SimpleTensor<T> &w
         // Accumulate the bias
         acc += bias_ptr[y];
 
-        acc = asymm_rounding_divide_by_pow2(asymm_int_mult(acc, output_multiplier), output_shift);
-        acc += output_offset;
-        acc = utility::clamp<int32_t>(acc, 0, 255);
+        // Quantize down
+        acc = quantize_down_scale_by_fixedpoint(acc, output_multiplier, output_shift, output_offset, 0, 255);
 
         // Store the result
         dst_ptr[y] = static_cast<T>(acc);
diff --git a/tests/validation/reference/UtilsQuantizedAsymm.h b/tests/validation/reference/UtilsQuantizedAsymm.h
index b7b69d588a..444696c93a 100644
--- a/tests/validation/reference/UtilsQuantizedAsymm.h
+++ b/tests/validation/reference/UtilsQuantizedAsymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H__
-#define __ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H__
+#ifndef ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H
+#define ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H
 
 #include <cstdint>
 
@@ -51,7 +51,25 @@ inline int32_t asymm_int_mult(int32_t a, int32_t b)
     int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
     return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
 }
+
+/** Quantize down the input value in range [min, max]. */
+inline int32_t quantize_down_scale_by_fixedpoint(int32_t val, int32_t result_mult_int, int32_t result_shift,
+                                                 int32_t result_offset_after_shift, int32_t min, int32_t max)
+{
+    int32_t res = 0;
+    if(result_shift < 0)
+    {
+        res = asymm_int_mult(val * (1 << (-result_shift)), result_mult_int);
+    }
+    else
+    {
+        res = asymm_rounding_divide_by_pow2(asymm_int_mult(val, result_mult_int), result_shift);
+    }
+    res += result_offset_after_shift;
+    res = utility::clamp<int32_t>(res, min, max);
+    return res;
+}
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H__ */
+#endif /* ARM_COMPUTE_TEST_VALIDATION_UTILS_QUANTIZED_ASYMM_H */
-- 
cgit v1.2.1