From af6204c331eed7894ec4c5fd4e98ec22b6dac676 Mon Sep 17 00:00:00 2001
From: Anton Lokhmotov <psyhtest@users.noreply.github.com>
Date: Wed, 8 Nov 2017 09:34:19 +0000
Subject: COMPMID-661: Add avgpool-uint8 support. Optimize avgpool-fp32 for
 Bifrost. (#13)

Change-Id: I32ba6afbac6694ffa053dd16f03a1b3d14627a19
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94857
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
---
 arm_compute/core/CL/kernels/CLPoolingLayerKernel.h |   4 +-
 arm_compute/core/Helpers.inl                       |   2 +-
 arm_compute/core/Types.h                           |   2 +-
 arm_compute/core/Utils.h                           |   2 +-
 arm_compute/runtime/CL/functions/CLPoolingLayer.h  |   4 +-
 src/core/CL/CLKernelLibrary.cpp                    |   7 +-
 src/core/CL/cl_kernels/pooling_layer.cl            | 101 +----------------
 src/core/CL/cl_kernels/pooling_layer_quantized.cl  | 121 +++++++++++++++++++++
 src/core/CL/kernels/CLActivationLayerKernel.cpp    |   4 +-
 .../CL/kernels/CLDirectConvolutionLayerKernel.cpp  |  11 +-
 src/core/CL/kernels/CLPoolingLayerKernel.cpp       | 100 +++++++++--------
 src/runtime/CL/functions/CLPoolingLayer.cpp        |  14 ++-
 src/runtime/CL/functions/CLSoftmaxLayer.cpp        |   2 +-
 tests/AssetsLibrary.h                              |   2 +
 tests/validation/CL/PoolingLayer.cpp               |  59 ++++++++--
 tests/validation/CPP/PoolingLayer.cpp              |  10 ++
 tests/validation/Helpers.cpp                       |  21 ++++
 tests/validation/Helpers.h                         |  17 +++
 tests/validation/fixtures/PoolingLayerFixture.h    |  68 +++++++++---
 19 files changed, 362 insertions(+), 189 deletions(-)
 create mode 100644 src/core/CL/cl_kernels/pooling_layer_quantized.cl

diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
index a9159a4bb8..ffb5d79514 100644
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
@@ -53,14 +53,14 @@ public:
      *
      * @note QS8 and QS16 are supported only for pool sizes 3, 5 and 7
      *
-     * @param[in]  input     Source tensor. Data types supported: QS8/QS16/F16/F32.
+     * @param[in]  input     Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
     void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel
      *
-     * @param[in] input     Source tensor info. Data types supported: QS8/QS16/F16/F32.
+     * @param[in] input     Source tensor info. Data types supported: QS8/QASYMM8/QS16/F16/F32.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      *
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index acdb9567db..656956d00a 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -263,7 +263,7 @@ inline bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_
 
 inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
 {
-    if(info.quantization_info().empty() && (is_data_type_quantized_assymetric(info.data_type())))
+    if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
     {
         info.set_quantization_info(quantization_info);
         return true;
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index e8be6127a8..eaff8fb709 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -120,7 +120,7 @@ struct QuantizationInfo
     float dequantize(uint8_t value) const
     {
         ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::dequantize: scale == 0");
-        float dequantized = (value - offset) * scale;
+        float dequantized = (static_cast<int>(value) - offset) * scale;
         return dequantized;
     }
 
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index b2bd7bd4ab..96e99e6874 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -792,7 +792,7 @@ inline bool is_data_type_fixed_point(DataType dt)
  *
  * @return True if data type is of symmetric quantized type, else false.
  */
-inline bool is_data_type_quantized_assymetric(DataType dt)
+inline bool is_data_type_quantized_asymmetric(DataType dt)
 {
     switch(dt)
     {
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 9c51534f78..58753c1410 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -43,14 +43,14 @@ class CLPoolingLayer : public ICLSimpleFunction
 public:
     /** Set the input and output tensors.
      *
-     * @param[in,out] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32.
+     * @param[in,out] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/QASYMM8/QS16/F16/F32.
      * @param[out]    output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]     pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
     void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer
      *
-     * @param[in] input     Source tensor info. Data types supported: QS8/QS16/F16/F32.
+     * @param[in] input     Source tensor info. Data types supported: QS8/QASYMM8/QS16/F16/F32.
      * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      *
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6efeebd63f..6ebdf298f1 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -271,9 +271,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
     { "pooling_layer_2", "pooling_layer.cl" },
     { "pooling_layer_3", "pooling_layer.cl" },
-    { "pooling_layer_3_optimized", "pooling_layer.cl" },
+    { "pooling_layer_optimized_3", "pooling_layer.cl" },
     { "pooling_layer_7", "pooling_layer.cl" },
     { "pooling_layer_N", "pooling_layer.cl" },
+    { "pooling_layer_N_quantized", "pooling_layer_quantized.cl" },
     { "quantization_layer", "quantization_layer.cl" },
     { "reduction_operation", "reduction_operation.cl" },
     { "remap_nearest_neighbour", "remap.cl" },
@@ -544,6 +545,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
     {
         "pooling_layer.cl",
 #include "./cl_kernels/pooling_layer.clembed"
+    },
+    {
+        "pooling_layer_quantized.cl",
+#include "./cl_kernels/pooling_layer_quantized.clembed"
     },
     {
         "quantization_layer.cl",
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 635c44a849..ee8ff27ab7 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -375,7 +375,7 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void pooling_layer_3_optimized(
+__kernel void pooling_layer_optimized_3(
     TENSOR3D_DECLARATION(input),
     TENSOR3D_DECLARATION(output))
 {
@@ -403,103 +403,6 @@ __kernel void pooling_layer_3_optimized(
 }
 #endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
 
-/** Performs a pooling function of pool size equal to 7.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QS8/QS16/F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_7(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    // Load data
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data1 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data2 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data3 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data4 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data5 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
-
-#if defined(POOL_L2)
-    // Raise to power of 2 for L2 Pooling
-    data0 = POW2_OP(data0, 8);
-    data1 = POW2_OP(data1, 8);
-    data2 = POW2_OP(data2, 8);
-    data3 = POW2_OP(data3, 8);
-    data4 = POW2_OP(data4, 8);
-    data5 = POW2_OP(data5, 8);
-    data6 = POW2_OP(data6, 8);
-#endif /* defined(POOL_L2) */
-
-    // Pool operation of all rows
-    data0 = POOL_OP(data0, data1);
-    data2 = POOL_OP(data2, data3);
-    data4 = POOL_OP(data4, data5);
-    data0 = POOL_OP(data0, data2);
-    data4 = POOL_OP(data4, data6);
-    data0 = POOL_OP(data0, data4);
-
-    // Set last element
-#if defined(POOL_AVG) || defined(POOL_L2)
-    data0.s7 = 0;
-#else  /* defined(POOL_AVG) || defined(POOL_L2) */
-    data0.s7 = data0.s6;
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-    // Reduce result
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    reduce4 = POOL_OP(data0.s0123, data0.s4567);
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    reduce2       = POOL_OP(reduce4.s01, reduce4.s23);
-    DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average pooling
-    res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-    // Take square root of the result in L2 pooling
-    res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
-    // Store result
-    *(__global DATA_TYPE *)output.ptr = res;
-}
-
 #if defined(POOL_SIZE)
 
 // Set the initial value for the pooling operation accordingly with the data type
@@ -608,4 +511,4 @@ __kernel void pooling_layer_N(
     // Store result
     *(__global DATA_TYPE *)output.ptr = res;
 }
-#endif // defined(POOL_SIZE)
\ No newline at end of file
+#endif // defined(POOL_SIZE)
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
new file mode 100644
index 0000000000..17448d19de
--- /dev/null
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+int calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+                        const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int       start_x = get_global_id(0) * stride_x - pad_x;
+    int       start_y = get_global_id(1) * stride_y - pad_y;
+    const int end_x   = min(start_x + pool_size, upper_bound_w);
+    const int end_y   = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_N_quantized(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int8 vdata = 0;
+    int  sdata = 0;
+
+    // Load data
+    for(int y = 0; y < POOL_SIZE; y++)
+    {
+        int x = 0;
+        for(; x <= ((int)POOL_SIZE - 8); x += 8)
+        {
+            uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0));
+            int8 data0  = convert_int8(data);
+            vdata       = POOL_OP(vdata, data0);
+        }
+
+        // Leftover
+        for(; x < (int)POOL_SIZE; ++x)
+        {
+            uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0));
+            int data0  = convert_int(data);
+            sdata      = POOL_OP(sdata, data0);
+        }
+    }
+
+    // Reduce result
+    int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+    int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+    int  res     = POOL_OP(reduce2.s0, reduce2.s1);
+    res          = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG)
+    res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) */
+
+    // Store result
+    *(__global uchar *)output.ptr = convert_uchar(res);
+}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 5bfc832518..adedebba53 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -101,7 +101,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
         build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
 
         // Set scale and offset of the input and output
-        if(is_data_type_quantized_assymetric(dt))
+        if(is_data_type_quantized_asymmetric(dt))
         {
             float s1 = input->info()->quantization_info().scale;
             int   o1 = input->info()->quantization_info().offset;
@@ -127,7 +127,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
     }
 
     // Create kernel
-    std::string kernel_name = is_data_type_quantized_assymetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
+    std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
     _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
 
     // Make sure _kernel is initialized before calling the parent's configure
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 53e46390c1..5f109f76af 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -84,7 +84,12 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     output_shape.set(2, weights->info()->dimension(3));
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(),
+                       output_shape,
+                       1,
+                       input->info()->data_type(),
+                       input->info()->fixed_point_position(),
+                       input->info()->quantization_info());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -176,7 +181,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     else
     {
         bool     is_quantized_fixed_point = is_data_type_fixed_point(data_type);
-        bool     is_quantized_asymm       = is_data_type_quantized_assymetric(data_type);
+        bool     is_quantized_asymm       = is_data_type_quantized_asymmetric(data_type);
         DataType promoted_type            = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;
 
         build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
@@ -220,7 +225,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     }
 
     // Set static kernel arguments
-    if(is_data_type_quantized_assymetric(data_type))
+    if(is_data_type_quantized_asymmetric(data_type))
     {
         int output_multiplier = 0;
         int output_shift      = 0;
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 2854cd8265..1317278fb5 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
@@ -80,7 +81,12 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
         output_shape.set(0, pooled_w);
         output_shape.set(1, pooled_h);
 
-        auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+        auto_init_if_empty(*output->info(),
+                           output_shape,
+                           1,
+                           input->info()->data_type(),
+                           input->info()->fixed_point_position(),
+                           input->info()->quantization_info());
     }
 
     ARM_COMPUTE_ERROR_THROW_ON(CLPoolingLayerKernel::validate(input->info(), output->info(), pool_info));
@@ -94,80 +100,80 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     _pool_info   = pool_info;
     _border_size = BorderSize(pool_pad_y, pool_pad_x);
 
-    // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type)));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    }
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    const DataType  data_type  = input->info()->data_type();
 
-    build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)));
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
+    build_opts.add_option_if(is_data_type_fixed_point(data_type),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
     if(pool_type != PoolingType::MAX)
     {
-        if(exclude_padding)
-        {
-            build_opts.emplace("-DEXCLUDE_PADDING");
-        }
-        build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x))));
-        build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y))));
-        build_opts.emplace(("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)));
-        build_opts.emplace(("-DPAD_X=" + support::cpp11::to_string(pool_pad_x)));
-        build_opts.emplace(("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y)));
+        build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+        build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x)));
+        build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y)));
+        build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+        build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_x));
+        build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y));
     }
 
     // Create kernel
-    if((pool_size == 2) || (pool_size == 3) || (pool_size == 7))
+    if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type))
     {
         // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
         // each thread computes 4 output elements
-        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
+        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
 
-        int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
+        int num_elems_read_per_iteration = pool_size;
         if(is_pool3x3_stride_le3)
         {
-            // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3
+            // Change the number of elements processed and the number of elements read per iteration
+            // for pooling 3x3 with stride less equal than 3
             _num_elems_processed_per_iteration = 4;
-            num_elements_read_per_iteration    = pool_size * (pool_stride_x + 1);
+            num_elems_read_per_iteration       = pool_size * (pool_stride_x + 1);
         }
 
-        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
+        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
         const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
 
         _border_size.right  = std::max(upper_bound_w, pool_pad_x);
         _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
 
-        std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
-        if(is_pool3x3_stride_le3)
-        {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
-        }
-        else
-        {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
-        }
+        std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
+                                  + support::cpp11::to_string(pool_size);
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
     else // Run general case
     {
-        _num_elems_processed_per_iteration = 1;
-
         const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
         const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
 
         _border_size.right  = std::max(upper_bound_w, pool_pad_x);
         _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
 
-        build_opts.emplace(("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)));
-        if(input->info()->data_type() == DataType::F16)
-        {
-            build_opts.emplace("-DFP16");
-        }
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pooling_layer_N", build_opts));
+        build_opts.add_option("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size));
+        build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
+
+        std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_N_quantized" : "pooling_layer_N";
+        _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+
+    // Configure the local work size (hint) from the first two dimensions of the global work size.
+    // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
+    // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
+    // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        cl::NDRange gws = ICLKernel::gws_from_window(win);
+        _lws_hint       = cl::NDRange(gws[0], gws[1], 1);
+    }
+
     AccessWindowStatic     input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
     AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
     update_window_and_padding(win, input_access, output_access);
@@ -178,14 +184,16 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
 Error CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2),
+                                    "Unsupported combination of parameters!");
 
     int pool_pad_x = 0;
     int pool_pad_y = 0;
     int pool_size  = pool_info.pool_size();
     std::tie(pool_pad_x, pool_pad_y) = pool_info.pad_stride_info().pad();
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_pad_x >= pool_size) || (pool_pad_y >= pool_size)),
-                                    "Invalid pool size and pool pad combination");
+                                    "Invalid pool size and pool pad combination!");
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -230,7 +238,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, in_slice);
         add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window_collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 6ca1a33b0c..ac360fbb3d 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -23,21 +23,33 @@
  */
 #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
     // Configure pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
+    k->set_target(CLScheduler::get().target());
     k->configure(input, output, pool_info);
     _kernel = std::move(k);
 
     // Configure border depending on operation required
     BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+    // Quantize border in case data type is quantized asymmetric data type
+    uint32_t border_value = 0;
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+    {
+        border_value = static_cast<uint32_t>(input->info()->quantization_info().quantize(0));
+    }
+
+    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(border_value));
 }
 
 Error CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7268d8eab5..a059f9e5fd 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -62,7 +62,7 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
 
     // Configure kernels
     // TODO (COMPMID-661): Remove legacy path once the new one is properly validated
-    _run_legacy_path = is_data_type_quantized_assymetric(input->info()->data_type());
+    _run_legacy_path = is_data_type_quantized_asymmetric(input->info()->data_type());
     if(_run_legacy_path)
     {
         _max_kernel.configure(input, &_max);
diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h
index ee136447ee..c2eee8b616 100644
--- a/tests/AssetsLibrary.h
+++ b/tests/AssetsLibrary.h
@@ -477,6 +477,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t
     switch(tensor.data_type())
     {
         case DataType::U8:
+        case DataType::QASYMM8:
         {
             std::uniform_int_distribution<uint8_t> distribution_u8(std::numeric_limits<uint8_t>::lowest(), std::numeric_limits<uint8_t>::max());
             fill(tensor, distribution_u8, seed_offset);
@@ -564,6 +565,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t
     switch(tensor.data_type())
     {
         case DataType::U8:
+        case DataType::QASYMM8:
         {
             ARM_COMPUTE_ERROR_ON(!(std::is_same<uint8_t, D>::value));
             std::uniform_int_distribution<uint8_t> distribution_u8(low, high);
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index 7038f2c34e..b3d56122db 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -43,19 +43,26 @@ namespace validation
 {
 namespace
 {
-/** Input data set for float data types */
+/** Input data set for floating-point data types */
 const auto PoolingLayerDatasetFP = combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { 2, 3, 4, 7, 9 })),
                                                    framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })),
                                            framework::dataset::make("ExcludePadding", { true, false }));
 
-/** Input data set for quantized data types */
+/** Input data set for fixed-point data types */
 const auto PoolingLayerDatasetQS = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { 2, 3 })),
                                                    framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })),
                                            framework::dataset::make("ExcludePadding", { true, false }));
-constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for float types */
-constexpr AbsoluteTolerance<float> tolerance_f16(0.01f);  /**< Tolerance value for comparing reference's output against implementation's output for float types */
-constexpr AbsoluteTolerance<float> tolerance_qs8(3);      /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
-constexpr AbsoluteTolerance<float> tolerance_qs16(6);     /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
+
+/** Input data set for asymmetric data type */
+const auto PoolingLayerDatasetQASYMM8 = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { 2, 3 })),
+                                                        framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })),
+                                                framework::dataset::make("ExcludePadding", { true, false }));
+
+constexpr AbsoluteTolerance<float>   tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+constexpr AbsoluteTolerance<float>   tolerance_f16(0.01f);  /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+constexpr AbsoluteTolerance<float>   tolerance_qs16(6);     /**< Tolerance value for comparing reference's output against implementation's output for 16-bit fixed-point type */
+constexpr AbsoluteTolerance<float>   tolerance_qs8(3);      /**< Tolerance value for comparing reference's output against implementation's output for 8-bit fixed-point type */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);  /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */
 } // namespace
 
 TEST_SUITE(CL)
@@ -64,12 +71,13 @@ TEST_SUITE(PoolingLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),     // Mismatching data type
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QS8, 4),
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QS8, 4),     // Mismatching fixed point position
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QS16, 11),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),     // Invalid pad/size combination
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),     // Invalid pad/size combination
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8, 0), // Invalid parameters
                                                      }),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16, 0),
                                                        TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
@@ -77,6 +85,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                        TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::QS16, 11),
                                                        TensorInfo(TensorShape(30U, 11U, 2U), 1, DataType::F32, 0),
                                                        TensorInfo(TensorShape(25U, 16U, 2U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8, 0),
                                                      })),
                framework::dataset::make("PoolInfo",  { PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 0, 0)),
                                                        PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 0, 0)),
@@ -84,8 +93,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                        PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 0, 0)),
                                                        PoolingLayerInfo(PoolingType::AVG, 2, PadStrideInfo(1, 1, 2, 0)),
                                                        PoolingLayerInfo(PoolingType::AVG, 2, PadStrideInfo(1, 1, 0, 2)),
+                                                       PoolingLayerInfo(PoolingType::L2, 3, PadStrideInfo(1, 1, 0, 0)),
                                                       })),
-               framework::dataset::make("Expected", { true, false, true, false, true, true})),
+               framework::dataset::make("Expected", { true, false, true, false, true, true, true })),
                input_info, output_info, pool_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLPoolingLayer::validate(&input_info, &output_info, pool_info)) == expected, framework::LogLevel::ERRORS);
@@ -131,7 +141,7 @@ TEST_SUITE_END()
 template <typename T>
 using CLPoolingLayerFixedPointFixture = PoolingLayerValidationFixedPointFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
 
-TEST_SUITE(Quantized)
+TEST_SUITE(FixedPoint)
 TEST_SUITE(QS8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixedPointFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQS,
                                                                                                                        framework::dataset::make("DataType", DataType::QS8))),
@@ -167,6 +177,31 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixedPointFixture<int16_t>, frame
 TEST_SUITE_END()
 TEST_SUITE_END()
 
+TEST_SUITE(Quantized)
+
+template <typename T>
+using CLPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8,
+                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127),
+                                                                                                                       QuantizationInfo(7.f / 255, 123)
+                                                                                                                                                            })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8,
+                                                                                                                   framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
diff --git a/tests/validation/CPP/PoolingLayer.cpp b/tests/validation/CPP/PoolingLayer.cpp
index 4f755ce2c4..90a48e0c44 100644
--- a/tests/validation/CPP/PoolingLayer.cpp
+++ b/tests/validation/CPP/PoolingLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "tests/validation/FixedPoint.h"
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -277,6 +278,15 @@ SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, PoolingLayerInfo info)
     return dst;
 }
 
+template <>
+SimpleTensor<uint8_t> pooling_layer<uint8_t>(const SimpleTensor<uint8_t> &src, PoolingLayerInfo info)
+{
+    SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float>   dst_tmp = pooling_layer<float>(src_tmp, info);
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric(dst_tmp, src.quantization_info());
+    return dst;
+}
+
 template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, PoolingLayerInfo info);
 template SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, PoolingLayerInfo info);
 template SimpleTensor<qint8_t> pooling_layer(const SimpleTensor<qint8_t> &src, PoolingLayerInfo info);
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index 23ad62a6c3..3ef5fc1cc5 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -112,6 +112,27 @@ HarrisCornersParameters harris_corners_parameters()
 
     return params;
 }
+
+SimpleTensor<float> convert_from_asymmetric(const SimpleTensor<uint8_t> &src)
+{
+    const QuantizationInfo &quantization_info = src.quantization_info();
+    SimpleTensor<float>     dst{ src.shape(), DataType::F32, 1, 0 };
+    for(int i = 0; i < src.num_elements(); ++i)
+    {
+        dst[i] = quantization_info.dequantize(src[i]);
+    }
+    return dst;
+}
+
+SimpleTensor<uint8_t> convert_to_asymmetric(const SimpleTensor<float> &src, const QuantizationInfo &quantization_info)
+{
+    SimpleTensor<uint8_t> dst{ src.shape(), DataType::QASYMM8, 1, 0, quantization_info };
+    for(int i = 0; i < src.num_elements(); ++i)
+    {
+        dst[i] = quantization_info.quantize(src[i]);
+    }
+    return dst;
+}
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index eecf976a13..6b1c4b9026 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -201,6 +201,23 @@ std::pair<T, T> get_batchnormalization_layer_test_bounds(int fixed_point_positio
 
     return bounds;
 }
+
+/** Convert quantized simple tensor into float using tensor quantization information.
+ *
+ * @param[in] src Quantized tensor.
+ *
+ * @return Float tensor.
+*/
+SimpleTensor<float> convert_from_asymmetric(const SimpleTensor<uint8_t> &src);
+
+/** Convert float simple tensor into quantized using specified quantization information.
+ *
+ * @param[in] src               Float tensor.
+ * @param[in] quantization_info Quantification information.
+ *
+ * @return Quantized tensor.
+*/
+SimpleTensor<uint8_t> convert_to_asymmetric(const SimpleTensor<float> &src, const QuantizationInfo &quantization_info);
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/PoolingLayerFixture.h b/tests/validation/fixtures/PoolingLayerFixture.h
index 09b9e0ef1a..d6190e2977 100644
--- a/tests/validation/fixtures/PoolingLayerFixture.h
+++ b/tests/validation/fixtures/PoolingLayerFixture.h
@@ -43,28 +43,34 @@ namespace test
 namespace validation
 {
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class PoolingLayerValidationFixedPointFixture : public framework::Fixture
+class PoolingLayerValidationGenericFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, PoolingType pool_type, int pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, int fractional_bits)
+    void setup(TensorShape shape, PoolingType pool_type, int pool_size, PadStrideInfo pad_stride_info, bool exclude_padding,
+               DataType data_type, int fractional_bits, QuantizationInfo quantization_info)
     {
-        _fractional_bits = fractional_bits;
+        _fractional_bits   = fractional_bits;
+        _quantization_info = quantization_info;
         PoolingLayerInfo info(pool_type, pool_size, pad_stride_info, exclude_padding);
 
-        _target    = compute_target(shape, info, data_type, fractional_bits);
-        _reference = compute_reference(shape, info, data_type, fractional_bits);
+        _target    = compute_target(shape, info, data_type, fractional_bits, quantization_info);
+        _reference = compute_reference(shape, info, data_type, fractional_bits, quantization_info);
     }
 
 protected:
     template <typename U>
     void fill(U &&tensor)
     {
-        if(_fractional_bits == 0)
+        if(!is_data_type_quantized(tensor.data_type()))
         {
             std::uniform_real_distribution<> distribution(-1.f, 1.f);
             library->fill(tensor, distribution, 0);
         }
+        else if(is_data_type_quantized_asymmetric(tensor.data_type()))
+        {
+            library->fill_tensor_uniform(tensor, 0);
+        }
         else
         {
             const int                       one_fixed = 1 << _fractional_bits;
@@ -73,10 +79,11 @@ protected:
         }
     }
 
-    TensorType compute_target(const TensorShape &shape, PoolingLayerInfo info, DataType data_type, int fixed_point_position = 0)
+    TensorType compute_target(const TensorShape &shape, PoolingLayerInfo info,
+                              DataType data_type, int fixed_point_position, QuantizationInfo quantization_info)
     {
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position);
+        TensorType src = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position, quantization_info);
         TensorType dst;
 
         // Create and configure function
@@ -102,10 +109,11 @@ protected:
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, PoolingLayerInfo info, DataType data_type, int fixed_point_position = 0)
+    SimpleTensor<T> compute_reference(const TensorShape &shape, PoolingLayerInfo info,
+                                      DataType data_type, int fixed_point_position, QuantizationInfo quantization_info)
     {
         // Create reference
-        SimpleTensor<T> src{ shape, data_type, 1, fixed_point_position };
+        SimpleTensor<T> src{ shape, data_type, 1, fixed_point_position, quantization_info };
 
         // Fill reference
         fill(src);
@@ -113,30 +121,56 @@ protected:
         return reference::pooling_layer<T>(src, info);
     }
 
-    TensorType      _target{};
-    SimpleTensor<T> _reference{};
-    int             _fractional_bits{};
+    TensorType       _target{};
+    SimpleTensor<T>  _reference{};
+    int              _fractional_bits{};
+    QuantizationInfo _quantization_info{};
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class PoolingLayerValidationFixture : public PoolingLayerValidationFixedPointFixture<TensorType, AccessorType, FunctionType, T>
+class PoolingLayerValidationFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
     template <typename...>
     void setup(TensorShape shape, PoolingType pool_type, int pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type)
     {
-        PoolingLayerValidationFixedPointFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, pool_type, pool_size, pad_stride_info, exclude_padding, data_type, 0);
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, pool_type, pool_size, pad_stride_info, exclude_padding,
+                                                                                               data_type, 0, QuantizationInfo());
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class PoolingLayerValidationFixedPointFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, PoolingType pool_type, int pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, int fractional_bits)
+    {
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, pool_type, pool_size, pad_stride_info, exclude_padding,
+                                                                                               data_type, fractional_bits, QuantizationInfo());
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class PoolingLayerValidationQuantizedFixture : public PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, PoolingType pool_type, int pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, QuantizationInfo quantization_info)
+    {
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, pool_type, pool_size, pad_stride_info, exclude_padding,
+                                                                                               data_type, 0, quantization_info);
     }
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class GlobalPoolingLayerValidationFixture : public PoolingLayerValidationFixedPointFixture<TensorType, AccessorType, FunctionType, T>
+class GlobalPoolingLayerValidationFixture : public PoolingLayerValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
 public:
     template <typename...>
     void setup(TensorShape shape, PoolingType pool_type, DataType data_type)
     {
-        PoolingLayerValidationFixedPointFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, pool_type, shape.x(), PadStrideInfo(1, 1, 0, 0), true, data_type, 0);
+        PoolingLayerValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, pool_type, shape.x(), PadStrideInfo(1, 1, 0, 0), true, data_type);
     }
 };
 } // namespace validation
-- 
cgit v1.2.1