From 51847d5dd9cad6bc81673642a01fd531def44311 Mon Sep 17 00:00:00 2001
From: Giorgio Arena <giorgio.arena@arm.com>
Date: Tue, 19 Oct 2021 15:45:57 +0100
Subject: Implement CLDirectConv3DKernel - uint8/int8

Resolve COMPMID-4663

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: I5c3c1cffed5385c06b789543318f7f4d6096987e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6468
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
---
 arm_compute/runtime/CL/functions/CLConv3D.h        |   2 +
 docs/user_guide/operator_list.dox                  |   2 +
 .../CL/cl_kernels/nhwc/direct_convolution3d.cl     |  52 +++++++--
 src/gpu/cl/kernels/ClDirectConv3dKernel.cpp        |  52 ++++++++-
 src/gpu/cl/kernels/ClDirectConv3dKernel.h          |   2 +
 src/gpu/cl/operators/ClDirectConv3d.h              |   2 +
 tests/validation/CL/Convolution3D.cpp              | 122 ++++++++++++++++++++-
 .../fixtures/DirectConvolution3DFixture.h          |  51 ++++++---
 tests/validation/reference/Conv3D.cpp              | 110 ++++++++++++++++---
 tests/validation/reference/Conv3D.h                |   4 +-
 10 files changed, 353 insertions(+), 46 deletions(-)
diff --git a/arm_compute/runtime/CL/functions/CLConv3D.h b/arm_compute/runtime/CL/functions/CLConv3D.h
index 241481b8ba..5728fe79d8 100644
--- a/arm_compute/runtime/CL/functions/CLConv3D.h
+++ b/arm_compute/runtime/CL/functions/CLConv3D.h
@@ -65,6 +65,8 @@ public:
      * |:--------------|:--------------|:------|:--------------|
      * |F16            |F16            |F16    |F16            |
      * |F32            |F32            |F32    |F32            |
+     * |QASYMM8        |QASYMM8        |S32    |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32    |QASYMM8_SIGNED |
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  src             Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth],
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index 1d06a394a9..55bfe38f55 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -629,6 +629,8 @@ where N = batches, C = channels, H = height, W = width, D = depth
     <tr><th>src0<th>src1<th>src2<th>dst
     <tr><td>F16<td>F16<td>F16<td>F16
     <tr><td>F32<td>F32<td>F32<td>F32
+    <tr><td>QASYMM8<td>QASYMM8<td>S32<td>QASYMM8
+    <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED<td>S32<td>QASYMM8_SIGNED
     </table>
 <tr>
   <td rowspan="2">Copy
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
index d11be5bbb3..587f3984ab 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
@@ -29,7 +29,7 @@
 /** OpenCL kernel to compute the direct convolution 3d.
  *
  * @note Data layout supported: NDHWC
- * @note Data type supported: F32/F16
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
  * @note The accumulation data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE_PROMOTED=half)
  * @note The convolution padding (left, top and front) must be passed at compile time using -DPAD_LEFT, -DPAD_TOP and -DPAD_FRONT (e.g. -DPAD_LEFT=2, -DPAD_TOP=2, -DPAD_FRONT=2)
  * @note The convolution strides must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y and -DSTRIDE_Z (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2, -DSTRIDE_Z=2)
@@ -44,12 +44,22 @@
  * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
  * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
  * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 1, 2, 3, 4, 5, .... n
  *  - N0 = 2, 3, 4, 8, 16
  *  - K0 = 2, 3, 4, 8, 16
  *
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ * @note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
+ *  - -DIS_QUANTIZED
+ *  - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234
+ *  - The destination quantization shift e.g. -DDST_SHIFT=4
+ *  - The destination offset e.g. -DDST_OFFSET=4
+ *  - The source offset e.g. -DSRC_OFFSET=4
+ *  - The weights offset e.g. -DWEI_OFFSET=4
+ *  - The quantized zero value e.g. -DZERO_VALUE=4
+ *
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time along with its tensor type by using -DBIA_DATA_TYPE (e.g. -DBIA_DATA_TYPE=int).
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -110,6 +120,13 @@ __kernel void direct_convolution3d_ndhwc(
 #define _IDST_CHANNELS DST_CHANNELS
 #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH)
 
+    // If quantized, the output tile has to be quantized first before being stored to global memory
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
+
     const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
     const int mout = GET_SPATIAL_IDX(1, M0, 0);          // WIDTH x HEIGHT x DEPTH
     const int bout = GET_SPATIAL_IDX(2, 1, 0);           // BATCH SIZE IDX
@@ -153,7 +170,7 @@ __kernel void direct_convolution3d_ndhwc(
 
             LOOP_UNROLLING(int, i, 0, 1, M0,
             {
-                a[i].v = (DATA_TYPE)0;
+                a[i].v = ZERO_VALUE;
             })
 
             // Load tile from the src tensor
@@ -175,6 +192,10 @@ __kernel void direct_convolution3d_ndhwc(
             // Compute the matrix multiplication between two tiles
             T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
 
+            // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+            // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
             ck += K0;
         }
 
@@ -187,7 +208,7 @@ __kernel void direct_convolution3d_ndhwc(
 
             LOOP_UNROLLING(int, i, 0, 1, M0,
             {
-                a[i].v = (DATA_TYPE)0;
+                a[i].v = ZERO_VALUE;
             })
 
             // Load tile from the src tensor
@@ -206,22 +227,30 @@ __kernel void direct_convolution3d_ndhwc(
             // // Compute the matrix multiplication between two tiles
             T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
 
+            // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+            // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
             ++ck;
         }
 #endif // ((_ISRC_CHANNELS % K0) != 0)
     }
 
+    // Offset correction required for the quantized asymmetric computation
+    // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+    T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
+
 #if defined(HAS_BIAS)
-    TILE(DATA_TYPE, 1, N0, bias0);
+    TILE(BIA_DATA_TYPE, 1, N0, bias0);
 
     if((cout + N0) <= _IDST_CHANNELS)
     {
-        bias0[0].v = VLOAD(N0)(0, (__global DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(DATA_TYPE)));
+        bias0[0].v = VLOAD(N0)(0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
     }
     else
     {
         VLOAD_PARTIAL(N0, PARTIAL_N0)
-        (bias0[0].v, 0, (__global DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(DATA_TYPE)));
+        (bias0[0].v, 0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout * sizeof(BIA_DATA_TYPE)));
     }
 
     // c = c + bias[broadcasted]
@@ -238,8 +267,15 @@ __kernel void direct_convolution3d_ndhwc(
         dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH *_IDST_HEIGHT * _IDST_DEPTH);
     })
 
+#if defined(IS_QUANTIZED)
+    TILE(DATA_TYPE, M0, N0, cq);
+
+    // Quantize the tile
+    T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+#endif // defined(IS_QUANTIZED)
+
     bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
 
     // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_N0, BUFFER, dst, cout, dst_stride_y, x_cond, c, dst_indirect_y);
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_N0, BUFFER, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
 }
\ No newline at end of file
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
index 88e73dc72a..27afb7e190 100644
--- a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -44,7 +45,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported");
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), "Weights feature map dimension should match the respective src's one");
@@ -56,7 +57,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
 
     if(src2 != nullptr)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+        if(is_data_type_quantized(src0->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+        }
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
     }
@@ -114,7 +122,6 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co
     CLBuildOptions build_options;
     build_options.add_option("-cl-fast-relaxed-math");
     build_options.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-    build_options.add_option("-DACC_DATA_TYPE=float");
     build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
     build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
     build_options.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src_depth));
@@ -136,7 +143,44 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co
     build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
     build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
     build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-    build_options.add_option_if(src2 != nullptr, std::string("-DHAS_BIAS"));
+
+    if(src2 != nullptr)
+    {
+        build_options.add_option(std::string("-DHAS_BIAS"));
+        build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type())));
+    }
+
+    if(is_data_type_quantized(data_type))
+    {
+        const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform();
+        const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+        PixelValue zero_value = PixelValue(0, src0->data_type(), src0->quantization_info());
+        int        zero_value_s32;
+        zero_value.get(zero_value_s32);
+
+        float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+        build_options.add_option("-DIS_QUANTIZED");
+        build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+        build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+        build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+        build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+        build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+        build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+    }
+    else
+    {
+        build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::F32));
+        build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
+        build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
+        build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
+        build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
+    }
 
     std::string kernel_name = "direct_convolution3d_ndhwc";
     _kernel                 = create_kernel(compile_context, kernel_name, build_options.options());
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
index 485c900826..de4f0ce216 100644
--- a/src/gpu/cl/kernels/ClDirectConv3dKernel.h
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
@@ -59,6 +59,8 @@ public:
      * |:--------------|:--------------|:------|:--------------|
      * |F16            |F16            |F16    |F16            |
      * |F32            |F32            |F32    |F32            |
+     * |QASYMM8        |QASYMM8        |S32    |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32    |QASYMM8_SIGNED |
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  src0            Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth],
diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h
index d8ffefc450..fa58b5aedd 100644
--- a/src/gpu/cl/operators/ClDirectConv3d.h
+++ b/src/gpu/cl/operators/ClDirectConv3d.h
@@ -55,6 +55,8 @@ public:
      * |:--------------|:--------------|:------|:--------------|
      * |F16            |F16            |F16    |F16            |
      * |F32            |F32            |F32    |F32            |
+     * |QASYMM8        |QASYMM8        |S32    |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32    |QASYMM8_SIGNED |
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  src0            Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth],
diff --git a/tests/validation/CL/Convolution3D.cpp b/tests/validation/CL/Convolution3D.cpp
index 75e2e99b03..381aacc465 100644
--- a/tests/validation/CL/Convolution3D.cpp
+++ b/tests/validation/CL/Convolution3D.cpp
@@ -38,10 +38,11 @@ namespace validation
 {
 namespace
 {
-RelativeTolerance<half>  tolerance_fp16(half(0.2));  /**< Tolerance for floating point tests */
-RelativeTolerance<float> tolerance_fp32(0.05f);      /**< Tolerance for floating point tests */
-constexpr float          abs_tolerance_f32(0.0001f); /**< Absolute tolerance for FP32 tests*/
-constexpr float          tolerance_num = 0.07f;      /**< Tolerance number */
+RelativeTolerance<half>              tolerance_fp16(half(0.2));  /**< Tolerance for floating point tests */
+RelativeTolerance<float>             tolerance_fp32(0.05f);      /**< Tolerance for floating point tests */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);       /**< Tolerance for quantized tests */
+constexpr float                      abs_tolerance_f32(0.0001f); /**< Absolute tolerance for FP32 tests*/
+constexpr float                      tolerance_num = 0.07f;      /**< Tolerance number */
 } // namespace
 
 TEST_SUITE(CL)
@@ -165,6 +166,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
 
 template <typename T>
 using CLDirectConvolution3DFixture = DirectConvolution3DValidationFixture<CLTensor, CLAccessor, CLConv3D, T>;
+template <typename T>
+using CLDirectConvolution3DQuantizedFixture = DirectConvolution3DValidationQuantizedFixture<CLTensor, CLAccessor, CLConv3D, T>;
 
 TEST_SUITE(NDHWC)
 TEST_SUITE(FP16)
@@ -266,6 +269,117 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolution3DFixture<float>, framework:
 // clang-format on
 // *INDENT-ON*
 TEST_SUITE_END() // FP32
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolution3DQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolution3DQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(400U, 400U, 200U, 11U) }),
+                                                                                                                   framework::dataset::make("StrideX", { 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 1 })),
+                                                                                                       framework::dataset::make("PadX", { 1 })),
+                                                                                                   framework::dataset::make("PadY", { 1 })),
+                                                                                               framework::dataset::make("PadZ", { 1 })),
+                                                                                           framework::dataset::make("KernelWidth", { 9 })),
+                                                                                       framework::dataset::make("KernelHeight", { 9 })),
+                                                                                   framework::dataset::make("KernelDepth", { 9 })),
+                                                                               framework::dataset::make("NumKernels", { 300 })),
+                                                                           framework::dataset::make("HasBias", { true })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolution3DQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolution3DQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(400U, 400U, 200U, 11U) }),
+                                                                                                                   framework::dataset::make("StrideX", { 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 1 })),
+                                                                                                       framework::dataset::make("PadX", { 1 })),
+                                                                                                   framework::dataset::make("PadY", { 1 })),
+                                                                                               framework::dataset::make("PadZ", { 1 })),
+                                                                                           framework::dataset::make("KernelWidth", { 9 })),
+                                                                                       framework::dataset::make("KernelHeight", { 9 })),
+                                                                                   framework::dataset::make("KernelDepth", { 9 })),
+                                                                               framework::dataset::make("NumKernels", { 300 })),
+                                                                           framework::dataset::make("HasBias", { true })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+
 TEST_SUITE_END() // NDHWC
 TEST_SUITE_END() // DirectConvolution3D
 TEST_SUITE_END() // CL
diff --git a/tests/validation/fixtures/DirectConvolution3DFixture.h b/tests/validation/fixtures/DirectConvolution3DFixture.h
index 3a675ac6d3..2250dcaeb0 100644
--- a/tests/validation/fixtures/DirectConvolution3DFixture.h
+++ b/tests/validation/fixtures/DirectConvolution3DFixture.h
@@ -40,19 +40,23 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class DirectConvolution3DValidationGenericFixture : public framework::Fixture
 {
 public:
+    using TBias = typename std::conditional < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int32_t, T >::type;
+
     template <typename...>
     void setup(const TensorShape &input_shape, int stride_x, int stride_y, int stride_z, int pad_x, int pad_y, int pad_z, unsigned int kernel_width, int kernel_height, int kernel_depth,
-               unsigned int num_kernels, bool has_bias, const ActivationLayerInfo &act_info, const DataType &data_type, const DataLayout &data_layout)
+               unsigned int num_kernels, bool has_bias, const ActivationLayerInfo &act_info, const DataType &data_type, const DataLayout &data_layout,
+               const QuantizationInfo &src_qinfo = QuantizationInfo(), const QuantizationInfo &weights_qinfo = QuantizationInfo(), const QuantizationInfo &dst_qinfo = QuantizationInfo())
     {
         ARM_COMPUTE_ERROR_ON(data_layout != DataLayout::NDHWC);
 
         const TensorShape weights_shape(num_kernels, input_shape[0], kernel_width, kernel_height, kernel_depth);
         const TensorShape bias_shape(num_kernels);
+        const DataType    bias_data_type = is_data_type_quantized(data_type) ? DataType::S32 : data_type;
         const Conv3dInfo  conv3d_info(Size3D(stride_x, stride_y, stride_z), Padding3D(pad_x, pad_y, pad_z), act_info, Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false);
         const TensorShape output_shape = compute_conv3d_shape(input_shape, weights_shape, conv3d_info);
 
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, conv3d_info, has_bias, data_type, data_layout);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, conv3d_info, has_bias, data_type);
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, conv3d_info, has_bias, data_type, bias_data_type, data_layout, src_qinfo, weights_qinfo, dst_qinfo);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, conv3d_info, has_bias, data_type, bias_data_type, src_qinfo, weights_qinfo, dst_qinfo);
     }
 
 protected:
@@ -79,13 +83,14 @@ protected:
     }
 
     TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const Conv3dInfo &conv3d_info,
-                              bool has_bias, const DataType &data_type, const DataLayout &data_layout)
+                              bool has_bias, const DataType &data_type, const DataType &bias_data_type, const DataLayout &data_layout, const QuantizationInfo &src_qinfo,
+                              const QuantizationInfo &weights_qinfo, const QuantizationInfo &dst_qinfo)
     {
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, QuantizationInfo(), data_layout);
-        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, QuantizationInfo(), data_layout);
-        TensorType bias    = has_bias ? create_tensor<TensorType>(bias_shape, data_type, 1, QuantizationInfo()) : TensorType();
-        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, QuantizationInfo(), data_layout);
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, src_qinfo, data_layout);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, weights_qinfo, data_layout);
+        TensorType bias    = has_bias ? create_tensor<TensorType>(bias_shape, bias_data_type, 1, QuantizationInfo()) : TensorType();
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, dst_qinfo, data_layout);
 
         // Create and configure function
         FunctionType conv{};
@@ -122,14 +127,15 @@ protected:
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const Conv3dInfo &conv3d_info,
-                                      bool has_bias, const DataType &data_type)
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape,
+                                      const Conv3dInfo &conv3d_info, bool has_bias, const DataType &data_type, const DataType &bias_data_type, const QuantizationInfo &src_qinfo,
+                                      const QuantizationInfo &weights_qinfo, const QuantizationInfo &dst_qinfo)
     {
         // Create reference
-        SimpleTensor<T> src{ input_shape, data_type };
-        SimpleTensor<T> weights{ weights_shape, data_type };
-        SimpleTensor<T> bias{ bias_shape, data_type };
-        SimpleTensor<T> dst{ output_shape, data_type };
+        SimpleTensor<T>     src{ input_shape, data_type, 1, src_qinfo };
+        SimpleTensor<T>     weights{ weights_shape, data_type, 1, weights_qinfo };
+        SimpleTensor<TBias> bias{ bias_shape, bias_data_type };
+        SimpleTensor<T>     dst{ output_shape, data_type, 1, dst_qinfo };
 
         // Fill reference
         fill(src, 0);
@@ -140,7 +146,7 @@ protected:
             fill(bias, 2);
         }
 
-        return reference::activation_layer(reference::conv3d<T>(src, weights, bias, dst, conv3d_info), conv3d_info.act_info);
+        return reference::activation_layer(reference::conv3d<T, TBias>(src, weights, bias, dst, conv3d_info), conv3d_info.act_info);
     }
 
     TensorType      _target{};
@@ -159,6 +165,21 @@ public:
                                                                                                       kernel_depth, num_kernels, has_bias, act_info, data_type, data_layout);
     }
 };
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DirectConvolution3DValidationQuantizedFixture : public DirectConvolution3DValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, int stride_x, int stride_y, int stride_z, int pad_x, int pad_y, int pad_z, unsigned int kernel_width, int kernel_height, int kernel_depth,
+               unsigned int num_kernels, bool has_bias, ActivationLayerInfo act_info, DataType data_type, DataLayout data_layout, QuantizationInfo src_qinfo, QuantizationInfo weights_qinfo,
+               QuantizationInfo dst_qinfo)
+    {
+        DirectConvolution3DValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, stride_z, pad_x, pad_y, pad_z, kernel_width, kernel_height,
+                                                                                                      kernel_depth, num_kernels, has_bias, act_info, data_type, data_layout, src_qinfo,
+                                                                                                      weights_qinfo, dst_qinfo);
+    }
+};
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/reference/Conv3D.cpp b/tests/validation/reference/Conv3D.cpp
index ad61105b36..706059d1cb 100644
--- a/tests/validation/reference/Conv3D.cpp
+++ b/tests/validation/reference/Conv3D.cpp
@@ -22,7 +22,11 @@
  * SOFTWARE.
  */
 #include "Conv3D.h"
+
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "support/Requires.h"
+#include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
 // Source/Destination Tensor shape indices (N D H W C)
 constexpr unsigned int batch_dim   = 4u;
@@ -52,11 +56,14 @@ inline bool is_valid_pixel(int i, int min, int max)
 {
     return (i >= min && i < max);
 }
+
 // Evaluate the weights against an element in a given tensor.
-template <typename T>
-T calculate_conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const Size3D &dilation, int batch,
-                   int z_start, int y_start, int x_start, int ch_out)
+template < typename T, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TB>::value, int >::type = 0 >
+T calculate_conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const Size3D &dilation, int batch,
+                   int z_start, int y_start, int x_start, int ch_out, UniformQuantizationInfo oq_info)
 {
+    ARM_COMPUTE_UNUSED(oq_info);
+
     const unsigned int weights_width  = weights.shape()[weights_width_dim];
     const unsigned int weights_height = weights.shape()[weights_height_dim];
     const unsigned int weights_depth  = weights.shape()[weights_depth_dim];
@@ -101,12 +108,89 @@ T calculate_conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, c
             }
         }
     }
-    return total;
+
+    const TB *b_ptr      = bias.data();
+    TB        bias_value = b_ptr[ch_out];
+
+    return total + bias_value;
 }
+
+template < typename T, typename TB, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+T calculate_conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const Size3D &dilation, int batch,
+                   int z_start, int y_start, int x_start, int ch_out, UniformQuantizationInfo oq_info)
+{
+    const unsigned int weights_width  = weights.shape()[weights_width_dim];
+    const unsigned int weights_height = weights.shape()[weights_height_dim];
+    const unsigned int weights_depth  = weights.shape()[weights_depth_dim];
+
+    const unsigned int src_channels = src.shape()[channel_dim];
+    const unsigned int src_width    = src.shape()[width_dim];
+    const unsigned int src_height   = src.shape()[height_dim];
+    const unsigned int src_depth    = src.shape()[depth_dim];
+
+    const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
+    const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
+    const int   input_offset   = -iq_info.offset;
+    const float input_scale    = iq_info.scale;
+    int         weights_offset = -wq_info.offset;
+    float       weights_scale  = wq_info.scale;
+    const int   output_offset  = oq_info.offset;
+    const float output_scale   = oq_info.scale;
+
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+    const float multiplier        = input_scale * weights_scale / output_scale;
+    arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+
+    int32_t total(0);
+    for(unsigned int weight_d = 0; weight_d < weights_depth; ++weight_d)
+    {
+        const int idx_z = z_start + dilation.depth * weight_d;
+        for(unsigned int weight_y = 0; weight_y < weights_height; ++weight_y)
+        {
+            const int idx_y = y_start + dilation.height * weight_y;
+            for(unsigned int weight_x = 0; weight_x < weights_width; ++weight_x)
+            {
+                const int idx_x = x_start + dilation.width * weight_x;
+
+                //Check if the point is within padding
+                const bool is_x_valid       = is_valid_pixel(idx_x, 0, src_width);
+                const bool is_y_valid       = is_valid_pixel(idx_y, 0, src_height);
+                const bool is_z_valid       = is_valid_pixel(idx_z, 0, src_depth);
+                const bool is_invalid_pixel = !(is_x_valid && is_y_valid && is_z_valid);
+                if(is_invalid_pixel)
+                {
+                    continue;
+                }
+
+                for(unsigned int ch_in = 0; ch_in < src_channels; ++ch_in)
+                {
+                    const T *in_ptr = src.data();
+                    const T *w_ptr  = weights.data();
+
+                    const int in_offset     = coord2index(src.shape(), Coordinates{ ch_in, idx_x, idx_y, idx_z, batch });
+                    const int weight_offset = coord2index(weights.shape(), Coordinates{ ch_out, ch_in, weight_x, weight_y, weight_d });
+                    T         input_value   = in_ptr[in_offset];
+                    T         weight_value  = w_ptr[weight_offset];
+                    total += ((input_value + input_offset) * (weight_value + weights_offset));
+                }
+            }
+        }
+    }
+
+    const TB *b_ptr      = bias.data();
+    TB        bias_value = b_ptr[ch_out];
+
+    total += bias_value;
+
+    return validation::quantize_down_scale_by_fixedpoint(total, output_multiplier, output_shift, output_offset,
+                                                         std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max());
 }
+} // namespace
 
-template <typename T>
-SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<T> &bias, SimpleTensor<T> &dst, const Conv3dInfo &conv3d_info)
+template <typename T, typename TB>
+SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const Conv3dInfo &conv3d_info)
 {
     // Compute reference
     const unsigned int batch_size     = src.shape()[batch_dim];
@@ -150,14 +234,10 @@ SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weight
                     const int x_start = (x_out * stride_x) - pad_left;
                     for(unsigned int ch_out = 0; ch_out < dst_channels; ++ch_out)
                     {
-                        T weighted_value = calculate_conv3d<T>(src, weights, conv3d_info.dilation, batch, z_start,
-                                                               y_start, x_start, ch_out);
-                        T        *out_ptr = dst.data();
-                        const T *b_ptr   = bias.data();
-                        T         bias_value(0);
+                        T *out_ptr = dst.data();
+
                         const int out_offset = coord2index(dst.shape(), Coordinates{ ch_out, x_out, y_out, z_out, batch });
-                        bias_value           = b_ptr[ch_out];
-                        out_ptr[out_offset]  = weighted_value + bias_value;
+                        out_ptr[out_offset]  = calculate_conv3d<T, TB>(src, weights, bias, conv3d_info.dilation, batch, z_start, y_start, x_start, ch_out, dst.quantization_info().uniform());
                     }
                 }
             }
@@ -170,6 +250,10 @@ template SimpleTensor<float> conv3d(const SimpleTensor<float> &src, const Simple
                                     const Conv3dInfo &conv3d_info);
 template SimpleTensor<half> conv3d(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, SimpleTensor<half> &dst,
                                    const Conv3dInfo &conv3d_info);
+template SimpleTensor<uint8_t> conv3d(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &dst,
+                                      const Conv3dInfo &conv3d_info);
+template SimpleTensor<int8_t> conv3d(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<int8_t> &dst,
+                                     const Conv3dInfo &conv3d_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Conv3D.h b/tests/validation/reference/Conv3D.h
index ade8a2c242..e3674f4bfb 100644
--- a/tests/validation/reference/Conv3D.h
+++ b/tests/validation/reference/Conv3D.h
@@ -37,8 +37,8 @@ namespace validation
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<T> &bias, SimpleTensor<T> &dst,
+template <typename T, typename TB>
+SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst,
                        const Conv3dInfo &conv3d_info);
 } // namespace reference
 } // namespace validation
-- 
cgit v1.2.1