author    Adnan AlSinan <adnan.alsinan@arm.com>  2022-04-06 16:19:31 +0100
committer Adnan AlSinan <adnan.alsinan@arm.com>  2022-04-13 11:55:45 +0000
commit    9104cd559222b98f2b21f14d4fd561ed4a4e9bc2 (patch)
tree      628b30de762e8e1dc3d21c5dcb76a92212fa00af
parent    16c5697085c256c19fb8ba4bef6188d61f30a88b (diff)
download  ComputeLibrary-9104cd559222b98f2b21f14d4fd561ed4a4e9bc2.tar.gz
Add support for int8 CpuPool3d
- Add implementation for the CPU pooling 3d layer.
- NDHWC data layout support.
- Support QASYMM8/QASYMM8_SIGNED.
- Add Pooling helper file for Pool3d/2d common functions.

Resolves COMPMID-4668

Change-Id: Iadf042036b076099c2353d6e2fe9fc623bc263d8
Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7387
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
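For orientation, a minimal usage sketch of the quantized path this patch enables (not part of the commit; the tensor shape and quantization parameters are hypothetical, and the Pooling3dLayerInfo fields follow the struct in arm_compute/core/Types.h):

    #include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Hypothetical NDHWC shape: C=8, W=H=D=16, N=1
        TensorInfo src_info(TensorShape(8U, 16U, 16U, 16U, 1U), 1, DataType::QASYMM8, QuantizationInfo(0.2f, 10));
        src_info.set_data_layout(DataLayout::NDHWC);

        Tensor src{}, dst{};
        src.allocator()->init(src_info);

        Pooling3dLayerInfo pool_info{};
        pool_info.pool_type       = PoolingType::AVG;
        pool_info.pool_size       = Size3D(3, 3, 3);
        pool_info.stride          = Size3D(1, 1, 1);
        pool_info.padding         = Padding3D(1, 1, 1);
        pool_info.exclude_padding = true; // required for quantized AVG, see the kernel change below

        NEPooling3dLayer pool{};
        pool.configure(&src, &dst, pool_info); // dst info is inferred during configure

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src with quantized data, then:
        pool.run();
        return 0;
    }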
-rw-r--r--  Android.bp                                              |   2
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPooling3dLayer.h   |   6
-rw-r--r--  docs/user_guide/operator_list.dox                       |   2
-rw-r--r--  filelist.json                                           |   8
-rw-r--r--  src/core/helpers/PoolingHelpers.h                       | 202
-rw-r--r--  src/cpu/kernels/CpuPool3dKernel.cpp                     |  17
-rw-r--r--  src/cpu/kernels/CpuPool3dKernel.h                       |   4
-rw-r--r--  src/cpu/kernels/pool2d/neon/fp16.cpp                    |  10
-rw-r--r--  src/cpu/kernels/pool2d/neon/fp32.cpp                    |  10
-rw-r--r--  src/cpu/kernels/pool2d/neon/nchw/all.cpp                |  14
-rw-r--r--  src/cpu/kernels/pool2d/neon/quantized.h                 | 161
-rw-r--r--  src/cpu/kernels/pool3d/list.h                           |   2
-rw-r--r--  src/cpu/kernels/pool3d/neon/impl.cpp                    |  62
-rw-r--r--  src/cpu/kernels/pool3d/neon/impl.h                      |   2
-rw-r--r--  src/cpu/kernels/pool3d/neon/qasymm8.cpp                 |  34
-rw-r--r--  src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp          |  34
-rw-r--r--  src/cpu/kernels/pool3d/neon/quantized.h                 | 390
-rw-r--r--  src/cpu/operators/CpuPool3d.h                           |   2
-rw-r--r--  tests/validation/NEON/Pooling3dLayer.cpp                |  78
-rw-r--r--  tests/validation/fixtures/Pooling3dLayerFixture.h       |  37
20 files changed, 855 insertions, 222 deletions
diff --git a/Android.bp b/Android.bp
index a440e79ffd..691e46e5ac 100644
--- a/Android.bp
+++ b/Android.bp
@@ -512,6 +512,8 @@ cc_library_static {
"src/cpu/kernels/pool3d/neon/fp16.cpp",
"src/cpu/kernels/pool3d/neon/fp32.cpp",
"src/cpu/kernels/pool3d/neon/impl.cpp",
+ "src/cpu/kernels/pool3d/neon/qasymm8.cpp",
+ "src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp",
"src/cpu/kernels/range/generic/neon/fp16.cpp",
"src/cpu/kernels/range/generic/neon/fp32.cpp",
"src/cpu/kernels/range/generic/neon/impl.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
index 7b31f916f6..4c5eb58e05 100644
--- a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
@@ -64,10 +64,12 @@ public:
* |:--------------|:--------------|
* |F16 |F16 |
* |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
*
* @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
*
- * @param[in] input Source tensor. Data types supported: F16/F32.
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
* @param[out] output Destination tensor.
* @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
*/
@@ -75,7 +77,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NEPooling3dLayer
*
*
- * @param[in] input Source tensor info. Data types supported: F16/F32.
+ * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
* @param[in] output Destination tensor info.
* @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
*
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index ee337d46ea..c0888f1775 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -2316,6 +2316,8 @@ where N = batches, C = channels, H = height, W = width, D = depth
<tr><th>src<th>dst
<tr><td>F16<td>F16
<tr><td>F32<td>F32
+ <tr><td>QASYMM8<td>QASYMM8
+ <tr><td>QASYMM8_SIGNED<td>QASYMM8_SIGNED
</table>
<tr>
<td>CLPooling3dLayer
diff --git a/filelist.json b/filelist.json
index 44e71c7e69..d02a6fc0c1 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1803,9 +1803,11 @@
"src/runtime/NEON/functions/NEPooling3dLayer.cpp"
],
"neon": {
- "common":[ "src/cpu/kernels/pool3d/neon/impl.cpp" ],
- "fp16": [ "src/cpu/kernels/pool3d/neon/fp16.cpp" ],
- "fp32": [ "src/cpu/kernels/pool3d/neon/fp32.cpp" ]
+ "common": [ "src/cpu/kernels/pool3d/neon/impl.cpp" ],
+ "fp16": [ "src/cpu/kernels/pool3d/neon/fp16.cpp" ],
+ "fp32": [ "src/cpu/kernels/pool3d/neon/fp32.cpp" ],
+ "qasymm8": [ "src/cpu/kernels/pool3d/neon/qasymm8.cpp" ],
+ "qasymm8_signed": [ "src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp" ]
}
}
},
diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h
new file mode 100644
index 0000000000..079629ee6a
--- /dev/null
+++ b/src/core/helpers/PoolingHelpers.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_POOLINGHELPERS_H
+#define SRC_CORE_HELPERS_POOLINGHELPERS_H
+
+#include "src/core/NEON/NEAsymm.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+
+inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
+ const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
+{
+ // Based on NDHWC
+ int start_x = id[1] * stride_x - pad_x;
+ int start_y = id[2] * stride_y - pad_y;
+ int start_z = id[3] * stride_z - pad_z;
+
+ const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
+ const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
+ if(exclude_padding)
+ {
+ start_x = std::max(0, start_x);
+ start_y = std::max(0, start_y);
+ start_z = std::max(0, start_z);
+ }
+ return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
+}
+
+inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ int start_x = id[idx_width] * stride_x - pad_x;
+ int start_y = id[idx_height] * stride_y - pad_y;
+
+ const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
+ if(exclude_padding)
+ {
+ start_x = std::max(0, start_x);
+ start_y = std::max(0, start_y);
+ }
+ return 1.f / ((end_y - start_y) * (end_x - start_x));
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
+quantize(float val, const UniformQuantizationInfo &info)
+{
+ return quantize_qasymm8_signed(val, info);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
+quantize(float val, const UniformQuantizationInfo &info)
+{
+ return quantize_qasymm8(val, info);
+}
+
+template <typename T>
+inline T vcvtq_q32_f32(float32x4_t values);
+
+template <>
+inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
+{
+ return vcvtq_u32_f32(values);
+}
+
+template <>
+inline int32x4_t vcvtq_q32_f32(float32x4_t values)
+{
+ return vcvtq_s32_f32(values);
+}
+
+template <typename T>
+inline float32x4_t vcvtq_f32_q32(T values);
+
+template <>
+inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
+{
+ return vcvtq_f32_u32(values);
+}
+
+template <>
+inline float32x4_t vcvtq_f32_q32(int32x4_t values)
+{
+ return vcvtq_f32_s32(values);
+}
+
+template <typename Tout>
+inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
+
+template <>
+inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
+{
+ const float new_scale = quant_rescale / scale_pooling;
+ return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
+}
+
+template <>
+inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
+{
+ const float new_scale = quant_rescale / scale_pooling;
+ return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
+}
+
+template <typename Tin, typename Tout>
+inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
+
+template <>
+inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
+{
+ const float32x4x4_t acc =
+ {
+ {
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
+ }
+ };
+ return vquantize(acc, requant_qinfo);
+}
+
+template <>
+inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
+{
+ const float32x4x4_t acc =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
+ }
+ };
+ return vquantize_signed(acc, requant_qinfo);
+}
+
+template <typename T>
+inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
+
+template <>
+inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
+{
+ const float32x4x2_t acc =
+ {
+ {
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
+ }
+ };
+ return vquantize(acc, requant_qinfo);
+}
+
+template <>
+inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
+{
+ const float32x4x2_t acc =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
+ }
+ };
+ return vquantize_signed(acc, requant_qinfo);
+}
+
+} // namespace
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */
+
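To make the averaging-scale helper above concrete, a standalone re-statement of calculate_avg_scale_pool3d for a single output point (hypothetical numbers: 3x3x3 window, stride 1, pad 1, input well inside bounds):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int pool = 3, stride = 1, pad = 1;
        const int upper_bound = 16; // input dim when exclude_padding == true

        for(const bool exclude_padding : { true, false })
        {
            int start = 0 * stride - pad;                       // first output element -> -1
            const int ub  = upper_bound + (exclude_padding ? 0 : pad);
            const int end = std::min(start + pool, ub);         // 2
            if(exclude_padding) { start = std::max(0, start); } // clamp to the real data
            const int n   = end - start;                        // 2 or 3 per axis
            std::printf("exclude_padding=%d -> scale = 1/%d\n", exclude_padding, n * n * n);
        }
        return 0; // prints 1/8 (only real samples) and 1/27 (padded zeros count)
    }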
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
index 3321967d2f..1305f7c5e8 100644
--- a/src/cpu/kernels/CpuPool3dKernel.cpp
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -44,11 +44,20 @@ using namespace misc::shape_calculator;
static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels =
{
{
+ "neon_qu8_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)
+ },
+ {
+ "neon_qs8_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)
+ },
+ {
"neon_fp16_ndhwc_poolMxNxD",
[](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)
},
-
{
"neon_fp32_ndhwc_poolMxNxD",
[](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
@@ -61,7 +70,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding
+ && (pool_info.pool_type == PoolingType::AVG)),
+ "Exclude padding is unsupported for non-float types for Avg op");
const auto data_layout = src->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
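One consequence of the new check above: for quantized types, AVG pooling must exclude padding from the average. A hedged sketch of the observable behaviour (src_info/dst_info are assumed to be QASYMM8 NDHWC tensor infos built as in the sketch near the top):

    Pooling3dLayerInfo info{};
    info.pool_type       = PoolingType::AVG;
    info.pool_size       = Size3D(3, 3, 3);
    info.padding         = Padding3D(1, 1, 1);
    info.exclude_padding = false; // padding counted in the average

    const Status s = NEPooling3dLayer::validate(&src_info, &dst_info, info);
    // For QASYMM8/QASYMM8_SIGNED sources, s.error_code() != ErrorCode::OK;
    // flipping exclude_padding to true makes the same configuration valid.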
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
index f762cfca9a..437f2af7e4 100644
--- a/src/cpu/kernels/CpuPool3dKernel.h
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -51,8 +51,10 @@ public:
* |:--------------|:--------------|
* |F16 |F16 |
* |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
*
- * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
* @param[out] dst Destination tensor info. Data types supported: Same as @p src.
* @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
*/
diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp
index 72f63af3be..13e21b1e70 100644
--- a/src/cpu/kernels/pool2d/neon/fp16.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -199,8 +199,8 @@ void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
const float16x8_t scale_v = vdupq_n_f16(scale);
// Perform pooling
@@ -260,8 +260,8 @@ void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
for(int y = pool_start_y; y < pool_end_y; ++y)
{
diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp
index e4261f746d..1ed199be8d 100644
--- a/src/cpu/kernels/pool2d/neon/fp32.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -193,8 +193,8 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
const float32x4_t scale_v = vdupq_n_f32(scale);
// Perform pooling
@@ -258,8 +258,8 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
for(int y = pool_start_y; y < pool_end_y; ++y)
{
diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
index 10cbfc56a1..77f63c6f77 100644
--- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp
+++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
@@ -124,7 +124,7 @@ void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
pool_stride_y);
const float16x4_t scale_v = vdup_n_f16(scale);
// Perform pooling
@@ -288,7 +288,7 @@ void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
if(pool_info.pool_type != PoolingType::MAX)
{
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
pool_stride_y);
const float16x4_t scale_v = vdup_n_f16(scale);
@@ -343,7 +343,7 @@ void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1,
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
pool_stride_y);
// Perform pooling
@@ -430,7 +430,7 @@ void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1,
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
@@ -538,7 +538,7 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
@@ -618,7 +618,7 @@ void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
@@ -687,7 +687,7 @@ void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
if(pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h
index 386e043984..a2cd3991be 100644
--- a/src/cpu/kernels/pool2d/neon/quantized.h
+++ b/src/cpu/kernels/pool2d/neon/quantized.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/PoolingHelpers.h"
#include <arm_neon.h>
namespace arm_compute
@@ -37,148 +38,6 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
-quantize(float val, const UniformQuantizationInfo &info)
-{
- return quantize_qasymm8_signed(val, info);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
-quantize(float val, const UniformQuantizationInfo &info)
-{
- return quantize_qasymm8(val, info);
-}
-
-template <typename T>
-inline T vcvtq_q32_f32(float32x4_t values);
-
-template <>
-inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
-{
- return vcvtq_u32_f32(values);
-}
-
-template <>
-inline int32x4_t vcvtq_q32_f32(float32x4_t values)
-{
- return vcvtq_s32_f32(values);
-}
-
-template <typename T>
-inline float32x4_t vcvtq_f32_q32(T values);
-
-template <>
-inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
-{
- return vcvtq_f32_u32(values);
-}
-
-template <>
-inline float32x4_t vcvtq_f32_q32(int32x4_t values)
-{
- return vcvtq_f32_s32(values);
-}
-
-template <typename Tout>
-inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
-
-template <>
-inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
-{
- const float new_scale = quant_rescale / scale_pooling;
- return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
-}
-
-template <>
-inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
-{
- const float new_scale = quant_rescale / scale_pooling;
- return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
-}
-
-template <typename Tin, typename Tout>
-inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
-
-template <>
-inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
- }
- };
- return vquantize(acc, requant_qinfo);
-}
-
-template <>
-inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
- }
- };
- return vquantize_signed(acc, requant_qinfo);
-}
-
-template <typename T>
-inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
-
-template <>
-inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
- }
- };
- return vquantize(acc, requant_qinfo);
-}
-
-template <>
-inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
- }
- };
- return vquantize_signed(acc, requant_qinfo);
-}
-
-inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- int start_x = id[idx_width] * stride_x - pad_x;
- int start_y = id[idx_height] * stride_y - pad_y;
-
- const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
- const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
- if(exclude_padding)
- {
- start_x = std::max(0, start_x);
- start_y = std::max(0, start_y);
- }
- return 1.f / ((end_y - start_y) * (end_x - start_x));
-}
-
-template <typename T>
void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
@@ -250,8 +109,8 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P
q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
// Perform pooling
for(int y = pool_start_y; y < pool_end_y; ++y)
@@ -352,8 +211,8 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P
q32_t res = static_cast<q32_t>(0.f);
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
// Perform pooling
for(int y = pool_start_y; y < pool_end_y; ++y)
@@ -531,8 +390,8 @@ void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
Iterator out(dst0, window);
/** SIMD vector types */
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+ using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
+ using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
using q16_t = typename wrapper::traits::promote_t<T>;
using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
@@ -867,8 +726,8 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *
q32_t sres = 0;
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+ pool_stride_y);
// Perform pooling
for(int y = 0; y < pool_size_y; ++y)
diff --git a/src/cpu/kernels/pool3d/list.h b/src/cpu/kernels/pool3d/list.h
index ece780eb0b..3426360f93 100644
--- a/src/cpu/kernels/pool3d/list.h
+++ b/src/cpu/kernels/pool3d/list.h
@@ -31,6 +31,8 @@ namespace cpu
#define DECLARE_POOLING_KERNEL(func_name) \
void func_name(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
+DECLARE_POOLING_KERNEL(neon_q8_pool3d);
+DECLARE_POOLING_KERNEL(neon_q8_signed_pool3d);
DECLARE_POOLING_KERNEL(neon_fp16_pool3d);
DECLARE_POOLING_KERNEL(neon_fp32_pool3d);
diff --git a/src/cpu/kernels/pool3d/neon/impl.cpp b/src/cpu/kernels/pool3d/neon/impl.cpp
index bb3999b104..2b089f3079 100644
--- a/src/cpu/kernels/pool3d/neon/impl.cpp
+++ b/src/cpu/kernels/pool3d/neon/impl.cpp
@@ -22,11 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/helpers/PoolingHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/pool3d/neon/quantized.h"
#include "src/cpu/kernels/pool3d/neon/impl.h"
@@ -36,27 +35,6 @@ namespace cpu
{
namespace
{
-inline float calculate_avg_scale(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
- const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
-{
- // Based on NDHWC
- int start_x = id[1] * stride_x - pad_x;
- int start_y = id[2] * stride_y - pad_y;
- int start_z = id[3] * stride_z - pad_z;
-
- const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
- const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
- const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
- if(exclude_padding)
- {
- start_x = std::max(0, start_x);
- start_y = std::max(0, start_y);
- start_z = std::max(0, start_z);
- }
- return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
-}
-
-
template <typename T>
void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
const int window_start_x, const int window_end_x, const int window_step_x)
@@ -227,9 +205,9 @@ void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
- pool_pad_top, pool_pad_front, pool_stride_x,
- pool_stride_y, pool_stride_z);
+ const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+ pool_pad_top, pool_pad_front, pool_stride_x,
+ pool_stride_y, pool_stride_z);
const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
int x_off = window_start_x;
@@ -354,9 +332,9 @@ void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dL
const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
// Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
- pool_pad_top, pool_pad_front, pool_stride_x,
- pool_stride_y, pool_stride_z);
+ const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+ pool_pad_top, pool_pad_front, pool_stride_x,
+ pool_stride_y, pool_stride_z);
int x_off = window_start_x;
@@ -452,9 +430,33 @@ void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye
}
}
+template <typename T>
+void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ constexpr int window_step_x = 16;
+ Window window_out = window;
+
+ // Needed to handle loop left-over
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ switch(pool_info.pool_type)
+ {
+ case PoolingType::MAX:
+ max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
+ break;
+ case PoolingType::AVG:
+ avg_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pool operation not supported");
+ }
+}
+
template void poolingMxNxD_fp_neon_ndhwc<float>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
template void poolingMxNxD_fp_neon_ndhwc<float16_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+template void poolingMxNxD_q8_neon_ndhwc<uint8_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+template void poolingMxNxD_q8_neon_ndhwc<int8_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
} // namespace cpu
} // namespace arm_compute
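Structurally, the q8 path mirrors the fp path: the templated kernel body stays in impl.cpp, is explicitly instantiated for uint8_t and int8_t at the bottom of the file, and the new single-type translation units below only wrap those instantiations so filelist.json can compile them in or out independently. A minimal standalone sketch of that pattern (hypothetical names, not the library's exact code):

    #include <cstdint>

    // impl.h: declaration only
    template <typename T> void pool3d_impl(const T *in, T *out, int n);

    // impl.cpp: definition plus explicit instantiations
    template <typename T>
    void pool3d_impl(const T *in, T *out, int n)
    {
        for(int i = 0; i < n; ++i) { out[i] = in[i]; } // placeholder kernel body
    }
    template void pool3d_impl<uint8_t>(const uint8_t *, uint8_t *, int);
    template void pool3d_impl<int8_t>(const int8_t *, int8_t *, int);

    // qasymm8.cpp: thin per-type entry point, registered with the kernel's selector
    void neon_q8_entry(const uint8_t *in, uint8_t *out, int n)
    {
        pool3d_impl<uint8_t>(in, out, n);
    }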
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
index 829a9bd192..7ad8c8eb05 100644
--- a/src/cpu/kernels/pool3d/neon/impl.h
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -37,6 +37,8 @@ namespace cpu
template <typename T>
void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+template <typename T>
+void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif //define SRC_CORE_POOLING_3D_LAYER_IMPL_H
diff --git a/src/cpu/kernels/pool3d/neon/qasymm8.cpp b/src/cpu/kernels/pool3d/neon/qasymm8.cpp
new file mode 100644
index 0000000000..650a815e76
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/qasymm8.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_q8_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ return poolingMxNxD_q8_neon_ndhwc<uint8_t>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..374b2435ea
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_q8_signed_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ return poolingMxNxD_q8_neon_ndhwc<int8_t>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h
new file mode 100644
index 0000000000..ac14f5eafa
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/quantized.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
+#define SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/PoolingHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename T>
+void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
+ const int window_step_x)
+
+{
+ using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
+ using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+ using q16_t = typename wrapper::traits::promote_t<T>;
+ using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
+ using q32_t = typename wrapper::traits::promote_t<q16_t>;
+ using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_right = static_cast<int>(pool_info.padding.right);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+ const int pool_pad_back = static_cast<int>(pool_info.padding.back);
+
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+ const int input_dim_c = src->info()->dimension(0);
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ const int window_end_x = input_dim_c;
+ const int window_start_x = 0;
+
+ Iterator out(dst0, window_out);
+
+ const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
+ const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
+
+ const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
+ // "new_offset" doesn't have to consider the "half_scale_v" in its computation
+ // With a requantization performed in a single step there won't be uncertainties introduced
+ const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+ pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ int x_off = window_start_x;
+
+ for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+
+ // Perform pooling
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
+ const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+ vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+ vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+ vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+ vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ }
+ }
+ }
+
+ if(src_qinfo != dst_qinfo)
+ {
+ const float32x4x4_t vres =
+ {
+ {
+ vcvtq_f32_q32(vres1),
+ vcvtq_f32_q32(vres2),
+ vcvtq_f32_q32(vres3),
+ vcvtq_f32_q32(vres4),
+ }
+ };
+ const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+ }
+ else
+ {
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+ vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+ vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+ vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+ const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+ const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }
+ }
+
+ // Left-overs loop
+ for(; x_off < window_end_x; ++x_off)
+ {
+ q32_t res = static_cast<q32_t>(0.f);
+
+ // Perform pooling
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data;
+ }
+ }
+ }
+
+ if(src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ const float new_scale = quant_rescale / scale;
+ const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ }
+ else
+ {
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ }
+ },
+ out);
+}
+
+template <typename T>
+void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
+ const int window_step_x)
+
+{
+ using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
+ using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+
+ const int window_half_step_x = window_step_x / 2;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+
+ const int input_dim_c = src->info()->dimension(0);
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ const int window_end_x = input_dim_c;
+ const int window_start_x = 0;
+
+ Iterator out(dst0, window_out);
+
+ const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
+
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ int x_off = window_start_x;
+
+ for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+
+ // Perform pooling
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ vres = wrapper::vmax(vres, data);
+ }
+ }
+ }
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
+ requant_qinfo) :
+ vres);
+ }
+
+ // Leftovers using half the window step
+ for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
+ {
+ q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+
+ // Perform pooling
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ vres = wrapper::vmax(vres, data);
+ }
+ }
+ }
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ }
+
+ // Left-overs loop
+ for(; x_off < window_end_x; ++x_off)
+ {
+ T res = std::numeric_limits<T>::min();
+
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ res = std::max(res, data);
+ }
+ }
+ }
+
+ // Store result
+ if(src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ }
+ },
+ out);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
\ No newline at end of file
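The single-step requantization constants used by the AVG path above can be checked with plain arithmetic: with a real average r = src_scale * (sum_q * scale - src_offset), the destination value r / dst_scale + dst_offset equals sum_q * scale / quant_rescale + new_offset, which is exactly what vquantize computes with new_scale = quant_rescale / scale. A small numeric check (a sketch, not library code; quantization infos taken from the test dataset further down, the 2x2x2 window and its sum are hypothetical):

    #include <cstdio>

    int main()
    {
        const float src_scale = 0.2f; const int src_offset = 10;
        const float dst_scale = 0.1f; const int dst_offset = 5;

        const float quant_rescale = dst_scale / src_scale;                                  // 0.5
        const int   new_offset    = dst_offset - static_cast<int>(src_offset / quant_rescale); // -15

        // Hypothetical 2x2x2 valid window (scale = 1/8) whose raw quantized values sum to 88
        const float scale = 1.0f / 8.0f;
        const float sum_q = 88.0f;

        // Requantize in one step, as vrequantize_pooling_with_scale does:
        const float new_scale = quant_rescale / scale;                                      // 4.0
        const int   q_out     = static_cast<int>(sum_q / new_scale + 0.5f) + new_offset;

        // Reference: dequantize, average, re-quantize in two steps
        const float real_avg  = src_scale * (sum_q * scale - src_offset);                   // 0.2
        const int   q_ref     = static_cast<int>(real_avg / dst_scale + 0.5f) + dst_offset; // 7

        std::printf("%d == %d\n", q_out, q_ref);                                            // 7 == 7
        return 0;
    }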
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
index fc73cf0e0e..8a73f8a0af 100644
--- a/src/cpu/operators/CpuPool3d.h
+++ b/src/cpu/operators/CpuPool3d.h
@@ -47,7 +47,7 @@ public:
/** Set the src and dst tensors.
*
*
- * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
* @param[out] dst Destination tensor info. Data types supported: same as @p src.
* @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
*/
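As a usage sketch of the quantized path documented above (shapes, strides and quantization parameters are made up for illustration; CpuPool3d accepts only the NDHWC layout):

    #include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void pool3d_qasymm8_sketch()
    {
        // NDHWC: ACL's TensorShape lists dimensions fastest-moving first,
        // so this is (C=16, W=32, H=32, D=8, N=1).
        TensorInfo src_info(TensorShape(16U, 32U, 32U, 8U, 1U), 1, DataType::QASYMM8);
        src_info.set_data_layout(DataLayout::NDHWC);
        src_info.set_quantization_info(QuantizationInfo(0.2f, 10));

        Tensor src, dst;
        src.allocator()->init(src_info);

        const Pooling3dLayerInfo pool_info(PoolingType::MAX, Size3D(3, 3, 3), Size3D(2, 2, 1), Padding3D(1, 1, 1));

        NEPooling3dLayer pool;
        pool.configure(&src, &dst, pool_info); // dst info is auto-initialized here

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pool.run();
    }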
diff --git a/tests/validation/NEON/Pooling3dLayer.cpp b/tests/validation/NEON/Pooling3dLayer.cpp
index ae5ca466b3..07054462f5 100644
--- a/tests/validation/NEON/Pooling3dLayer.cpp
+++ b/tests/validation/NEON/Pooling3dLayer.cpp
@@ -55,12 +55,43 @@ const auto Pooling3dLayerDatasetFPSmall = combine(combine(combine(combine(datase
framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
framework::dataset::make("ExcludePadding", { true, false }));
+const auto Pooling3dLayerDatasetQASYMM8Small = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+ framework::dataset::make("PoolingSize", { Size3D(3, 3, 3) })),
+ framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+ framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+ framework::dataset::make("ExcludePadding", { true }));
+
+const auto Pooling3dLayerDatasetQASYMM8Large = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+ framework::dataset::make("PoolingSize", { Size3D(3, 3, 3) })),
+ framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 2, 1) })),
+ framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 0) })),
+ framework::dataset::make("ExcludePadding", { true }));
+
using ShapeDataset = framework::dataset::ContainerDataset<std::vector<TensorShape>>;
constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr AbsoluteTolerance<float> tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<float> tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric type */
+constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_s(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric type */
+
+const auto qasymm8_in_qinfo_dataset = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, 10) });
+const auto qasymm8_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+ QuantizationInfo(.2f, 10), // Same qinfo
+ QuantizationInfo(.1f, 5), // Multiplier <= 1
+ QuantizationInfo(2.f, 3) // Multiplier > 1
+});
+
+const auto qasymm8_signed_in_qinfo_dataset = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, -10) });
+const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+ QuantizationInfo(.2f, -10), // Same qinfo
+ QuantizationInfo(.1f, -5), // Multiplier <= 1
+ QuantizationInfo(2.f, -3) // Multiplier > 1
+});
+
} //namespace
TEST_SUITE(NEON)
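The three OutputQuantInfo entries above are chosen to exercise the no-requantization path plus the multiplier <= 1 and multiplier > 1 requantization paths. When src_qinfo != dst_qinfo, the kernel folds both qinfos into a single requant_qinfo (as in the scalar leftovers path of the kernel), which is mathematically a dequantize-then-quantize. A scalar model (helper name and values are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Requantization between two affine qinfos: dequantize with the input
    // scale/offset, quantize with the output scale/offset, saturate to uint8.
    uint8_t requantize(uint8_t v, float in_scale, int32_t in_offset,
                       float out_scale, int32_t out_offset)
    {
        const float   real = (static_cast<int32_t>(v) - in_offset) * in_scale;                 // dequantize
        const int32_t q    = static_cast<int32_t>(std::lround(real / out_scale)) + out_offset; // quantize
        return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));          // saturate
    }
    // e.g. requantize(40, 0.2f, 10, 0.1f, 5) == 65: (40 - 10) * 0.2 = 6.0; 6.0 / 0.1 + 5 = 65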
@@ -280,8 +311,49 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::Datas
TEST_SUITE_END() // GlobalPooling
TEST_SUITE_END() // FP16
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
TEST_SUITE_END() // Float
+TEST_SUITE(Quantized)
+
+template <typename T>
+using NEPooling3dLayerQuantizedFixture = Pooling3dLayerValidationQuantizedFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+ combine(Pooling3dLayerDatasetQASYMM8Small,
+ framework::dataset::make("DataType", DataType::QASYMM8))),
+ qasymm8_in_qinfo_dataset),
+ qasymm8_out_qinfo_dataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(),
+ combine(Pooling3dLayerDatasetQASYMM8Large,
+ framework::dataset::make("DataType", DataType::QASYMM8))),
+ qasymm8_in_qinfo_dataset),
+ qasymm8_out_qinfo_dataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPooling3dLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+ combine(Pooling3dLayerDatasetQASYMM8Small,
+ framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+ qasymm8_signed_in_qinfo_dataset),
+ qasymm8_signed_out_qinfo_dataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
TEST_SUITE_END() // Pooling3dLayer
TEST_SUITE_END() // NEON
} // namespace validation
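For the AVG cases in the datasets above, the reference implementation dequantizes the window, averages in floating point, and quantizes with the output qinfo; the 1-LSB tolerances (tolerance_qasymm8/tolerance_qasymm8_s) absorb the rounding differences against the NEON path. A scalar sketch of that reference behaviour (helper name and parameters are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Quantized AVG pooling, reference-style: dequantize every window
    // element, average in float, quantize with the output qinfo.
    uint8_t avg_pool_ref(const uint8_t *window, int count,
                         float in_scale, int32_t in_offset,
                         float out_scale, int32_t out_offset)
    {
        float acc = 0.f;
        for(int i = 0; i < count; ++i)
        {
            acc += (static_cast<int32_t>(window[i]) - in_offset) * in_scale; // dequantize
        }
        const float   avg = acc / static_cast<float>(count);
        const int32_t q   = static_cast<int32_t>(std::lround(avg / out_scale)) + out_offset;
        return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
    }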
diff --git a/tests/validation/fixtures/Pooling3dLayerFixture.h b/tests/validation/fixtures/Pooling3dLayerFixture.h
index c1b3519e80..563f1dcced 100644
--- a/tests/validation/fixtures/Pooling3dLayerFixture.h
+++ b/tests/validation/fixtures/Pooling3dLayerFixture.h
@@ -46,10 +46,10 @@ class Pooling3dLayerValidationGenericFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(TensorShape shape, Pooling3dLayerInfo pool_info, DataType data_type)
+ void setup(TensorShape shape, Pooling3dLayerInfo pool_info, DataType data_type, QuantizationInfo input_qinfo = QuantizationInfo(), QuantizationInfo output_qinfo = QuantizationInfo())
{
- _target = compute_target(shape, pool_info, data_type);
- _reference = compute_reference(shape, pool_info, data_type);
+ _target = compute_target(shape, pool_info, data_type, input_qinfo, output_qinfo);
+ _reference = compute_reference(shape, pool_info, data_type, input_qinfo, output_qinfo);
}
protected:
@@ -68,17 +68,17 @@ protected:
}
else // data type is quantized_asymmetric
{
- ARM_COMPUTE_ERROR("Passed Type Not Supported");
+ library->fill_tensor_uniform(tensor, 0);
}
}
TensorType compute_target(TensorShape shape, Pooling3dLayerInfo info,
- DataType data_type)
+ DataType data_type, QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
{
// Create tensors
- TensorType src = create_tensor<TensorType>(shape, data_type, 1, QuantizationInfo(), DataLayout::NDHWC);
+ TensorType src = create_tensor<TensorType>(shape, data_type, 1, input_qinfo, DataLayout::NDHWC);
const TensorShape dst_shape = misc::shape_calculator::compute_pool3d_shape((src.info()->tensor_shape()), info);
- TensorType dst = create_tensor<TensorType>(dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NDHWC);
+ TensorType dst = create_tensor<TensorType>(dst_shape, data_type, 1, output_qinfo, DataLayout::NDHWC);
// Create and configure function
FunctionType pool_layer;
@@ -103,17 +103,17 @@ protected:
return dst;
}
- SimpleTensor<T> compute_reference(TensorShape shape, Pooling3dLayerInfo info, DataType data_type)
+ SimpleTensor<T> compute_reference(TensorShape shape, Pooling3dLayerInfo info, DataType data_type, QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
{
// Create reference
- SimpleTensor<T> src(shape, data_type, 1, QuantizationInfo(), DataLayout::NDHWC);
+ SimpleTensor<T> src(shape, data_type, 1, input_qinfo, DataLayout::NDHWC);
// Fill reference
fill(src);
- return reference::pooling_3d_layer<T>(src, info);
+ return reference::pooling_3d_layer<T>(src, info, output_qinfo);
}
- TensorType _target{};
- SimpleTensor<T> _reference{};
+ TensorType _target{};
+ SimpleTensor<T> _reference{};
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -129,6 +129,19 @@ public:
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class Pooling3dLayerValidationQuantizedFixture : public Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+ template <typename...>
+ void setup(TensorShape shape, PoolingType pool_type, Size3D pool_size, Size3D stride, Padding3D padding, bool exclude_padding, DataType data_type,
+ QuantizationInfo input_qinfo = QuantizationInfo(), QuantizationInfo output_qinfo = QuantizationInfo())
+ {
+ Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, Pooling3dLayerInfo(pool_type, pool_size, stride, padding, exclude_padding),
+ data_type, input_qinfo, output_qinfo);
+ }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
class Pooling3dLayerGlobalValidationFixture : public Pooling3dLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
{
public:
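For orientation, one combined-dataset tuple from the QASYMM8 RunSmall case maps onto the new quantized fixture's setup() in this order (values are illustrative; assumes the arm_compute::test::validation context of the test files above):

    // One RunSmall combination, expanded by hand:
    Pooling3dLayerValidationQuantizedFixture<Tensor, Accessor, NEPooling3dLayer, uint8_t> fixture;
    fixture.setup(TensorShape(16U, 16U, 16U, 4U, 1U),  // 5D NDHWC shape (C, W, H, D, N)
                  PoolingType::AVG,                    // PoolingType
                  Size3D(3, 3, 3),                     // PoolingSize
                  Size3D(1, 1, 1),                     // Stride
                  Padding3D(0, 0, 0),                  // Padding
                  true,                                // ExcludePadding
                  DataType::QASYMM8,                   // DataType
                  QuantizationInfo(0.2f, 10),          // InputQuantInfo
                  QuantizationInfo(0.1f, 5));          // OutputQuantInfo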