From 5518671926c2c493e023a2e0d78b4aef4cb0dcec Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 8 Jan 2018 17:37:12 +0000
Subject: COMPMID-785: Add QASYMM8 support for pooling layer

Adds generic pooling case for QASYMM8

Change-Id: I37d38a92ca61651e915fbbbb6da88e180390b4ab
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/115439
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 arm_compute/core/NEON/kernels/NEFillBorderKernel.h |   4 +-
 .../core/NEON/kernels/NEPoolingLayerKernel.h       |  28 +-
 .../runtime/NEON/functions/NEPoolingLayer.h        |   6 +-
 src/core/NEON/kernels/NEFillBorderKernel.cpp       |   9 +-
 src/core/NEON/kernels/NEPoolingLayerKernel.cpp     | 622 ++++++++++++++++++---
 src/core/Utils.cpp                                 |   2 +
 src/runtime/CL/functions/CLPoolingLayer.cpp        |  12 +-
 src/runtime/NEON/functions/NEPoolingLayer.cpp      |  11 +-
 tests/benchmark/NEON/PoolingLayer.cpp              |   6 +-
 tests/validation/NEON/PoolingLayer.cpp             |  43 +-
 10 files changed, 637 insertions(+), 106 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
index 4e003243ba..164158bb79 100644
--- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@ public:
      *
      * @note This kernel fills the borders within the XY-planes.
      *
-     * @param[in,out] tensor                Tensor to process. Data types supported: U8/S8/QS8/QS16/S16/S32/F32.
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8/S8/QS8/QASYMM8/QS16/S16/S32/F32.
      * @param[in]     border_size           Size of the border to fill in elements.
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 87d14e5f91..19c4656679 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,7 @@ public:
      *
      * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
      *
-     * @param[in]  input     Source tensor. Data types supported: QS8/QS16/F16/F32.
+     * @param[in]  input     Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
@@ -59,7 +59,7 @@ public:
      *
      * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
      *
-     * @param[in] input     Source tensor. Data types supported: QS8/QS16/F16/F32.
+     * @param[in] input     Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
      * @param[in] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      *
@@ -86,7 +86,6 @@ private:
      */
     template <PoolingType pooling_type, bool exclude_padding>
     void pooling2_f16(const Window &window_input, const Window &window);
-
     /** Function to perform 2x2 pooling for 8bit fixed point.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -94,6 +93,13 @@ private:
      */
     template <PoolingType pooling_type>
     void pooling2_q8(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling for 8-bit quantized asymmetric type.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type, bool exclude_padding>
+    void pooling2_qasymm8(const Window &window_input, const Window &window);
     /** Function to perform 2x2 pooling for 16bit fixed point.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -122,6 +128,13 @@
      */
     template <PoolingType pooling_type>
     void pooling3_q8(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling for 8-bit quantized asymmetric type.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type, bool exclude_padding>
+    void pooling3_qasymm8(const Window &window_input, const Window &window);
     /** Function to perform 3x3 pooling for 16bit fixed point.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -142,6 +155,13 @@
      * @param[in] window       Output region on which to execute the kernel.
      */
     template <PoolingType pooling_type, bool exclude_padding>
+    void poolingN_qasymm8(const Window &window_input, const Window &window);
+    /** Function to perform NxN pooling.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type, bool exclude_padding>
     void poolingN_f32(const Window &window_input, const Window &window);
     /** Common signature for all the specialised Pooling functions
      *
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 0f8abb587d..3ac0844798 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,7 +48,7 @@ public:
      *
      * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
      *
-     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32.
+     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/QASYMM8/QS16/F16/F32.
      * @param[out]     output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
@@ -57,7 +57,7 @@ public:
      *
      * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
      *
-     * @param[in] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32.
+     * @param[in] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/QASYMM8/QS16/F16/F32.
      * @param[in] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      *
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index af04955608..747b8b1bfe 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -105,7 +105,10 @@ NEFillBorderKernel::NEFillBorderKernel()
 
 void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QASYMM8,
+                                                  DataType::QS16, DataType::U16, DataType::S16,
+                                                  DataType::U32, DataType::S32,
+                                                  DataType::F16, DataType::F32);
 
     _tensor      = tensor;
     _border_size = border_size;
@@ -140,6 +143,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
         {
             switch(_tensor->info()->data_type())
             {
+                case DataType::QASYMM8:
                 case DataType::U8:
                     fill_constant_value_single_channel<uint8_t>(window);
                     break;
@@ -184,6 +188,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
         {
             switch(_tensor->info()->data_type())
             {
+                case DataType::QASYMM8:
                 case DataType::U8:
                     fill_replicate_single_channel<uint8_t>(window);
                     break;
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 47372c2d5d..ac183d2f30 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,6 +28,7 @@
 #include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -35,6 +36,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "support/ToolchainSupport.h"
+
 #include <algorithm>
 #include <arm_neon.h>
 #include <cmath>
@@ -98,6 +101,56 @@ inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, in
     return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
 }
 
+template <bool exclude_padding>
+inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
+                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
+                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int       start_x = (id.x() + id_offset) * stride_x - pad_x;
+    int       start_y = id.y() * stride_y - pad_y;
+    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
+    if(exclude_padding)
+    {
+        start_y = std::max(0, start_y);
+    }
+
+    std::array<uint16_t, 8> elems =
+    {
+        {
+            vgetq_lane_u16(v, 0),
+            vgetq_lane_u16(v, 1),
+            vgetq_lane_u16(v, 2),
+            vgetq_lane_u16(v, 3),
+            vgetq_lane_u16(v, 4),
+            vgetq_lane_u16(v, 5),
+            vgetq_lane_u16(v, 6),
+            vgetq_lane_u16(v, 7),
+        }
+    };
+
+    for(auto &el : elems)
+    {
+        int       c_start_x = start_x;
+        const int end_x     = std::min(c_start_x + pool_size, upper_bound_w);
+        if(exclude_padding)
+        {
+            c_start_x = std::max(0, c_start_x);
+        }
+        float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
+        el *= scale;
+        start_x += step * stride_x;
+    }
+
+    v = vsetq_lane_u16(elems[0], v, 0);
+    v = vsetq_lane_u16(elems[1], v, 1);
+    v = vsetq_lane_u16(elems[2], v, 2);
+    v = vsetq_lane_u16(elems[3], v, 3);
+    v = vsetq_lane_u16(elems[4], v, 4);
+    v = vsetq_lane_u16(elems[5], v, 5);
+    v = vsetq_lane_u16(elems[6], v, 6);
+    v = vsetq_lane_u16(elems[7], v, 7);
+}
+
 Status
validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -114,9 +167,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); static const std::set supported_pool_sizes = { 2, 3 }; - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->data_type() != DataType::F32)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8))); ARM_COMPUTE_RETURN_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size)); ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y())); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2); @@ -185,6 +238,26 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; break; + case DataType::QASYMM8: + switch(pool_size) + { + case 2: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + break; + case 3: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + break; + default: + num_elems_read_per_iteration = 1; + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + } + break; case DataType::QS16: num_elems_read_per_iteration = 8; switch(pool_size) @@ -328,12 +401,15 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons _output = output; _pool_info = pool_info; + // Get data type + const DataType data_type = input->info()->data_type(); + // Select appropriate function - switch(pool_size) + if(data_type == DataType::QS8) { - case 2: - if(input->info()->data_type() == DataType::QS8) - { + switch(pool_size) + { + case 2: switch(pool_type) { case PoolingType::AVG: @@ -345,9 +421,74 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); } + break; + case 3: + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling3_q8; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_q8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling size!"); + } + } + else if(data_type == DataType::QASYMM8) + { + if(pool_size == 2 && pool_stride_x < 3) + { + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? 
&NEPoolingLayerKernel::pooling2_qasymm8 : &NEPoolingLayerKernel::pooling2_qasymm8; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_qasymm8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + } + else if(pool_size == 3 && pool_stride_x < 3) + { + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8 : &NEPoolingLayerKernel::pooling3_qasymm8; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_qasymm8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); } - else if(input->info()->data_type() == DataType::QS16) + } + else + { + switch(pool_type) { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_qasymm8 : &NEPoolingLayerKernel::poolingN_qasymm8; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingN_qasymm8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + } + } + else if(data_type == DataType::QS16) + { + switch(pool_size) + { + case 2: switch(pool_type) { case PoolingType::AVG: @@ -359,9 +500,29 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); } - } - else if(input->info()->data_type() == DataType::F16) - { + break; + case 3: + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling3_q16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_q16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling size!"); + } + } + else if(data_type == DataType::F16) + { + switch(pool_size) + { + case 2: switch(pool_type) { case PoolingType::AVG: @@ -376,121 +537,96 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); } - } - else if(input->info()->data_type() == DataType::F32) - { + break; + case 3: switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_f32; + _func = &NEPoolingLayerKernel::pooling3_f16; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); } - } - break; - case 3: - if(input->info()->data_type() == DataType::QS8) - { + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling size!"); + } + } + else if(data_type == DataType::F32) + { + switch(pool_size) + { + case 2: switch(pool_type) { case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling3_q8; + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + break; + case PoolingType::L2: + _func = (exclude_padding) ? 
&NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_q8; + _func = &NEPoolingLayerKernel::pooling2_f32; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); } - } - else if(input->info()->data_type() == DataType::QS16) - { + break; + case 3: switch(pool_type) { case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling3_q16; + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + break; + case PoolingType::L2: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_q16; + _func = &NEPoolingLayerKernel::pooling3_f32; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); } - } - else if(input->info()->data_type() == DataType::F16) - { + break; + case 7: switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_f16; + _func = &NEPoolingLayerKernel::pooling7_f32; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); } - } - else if(input->info()->data_type() == DataType::F32) - { + break; + default: switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32 : &NEPoolingLayerKernel::poolingN_f32; break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32 : &NEPoolingLayerKernel::poolingN_f32; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_f32; + _func = &NEPoolingLayerKernel::poolingN_f32; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); } - } - break; - case 7: - switch(pool_type) - { - case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; - break; - case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling7_f32; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - default: - switch(pool_type) - { - case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32 : &NEPoolingLayerKernel::poolingN_f32; - break; - case PoolingType::L2: - _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingN_f32 : &NEPoolingLayerKernel::poolingN_f32; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingN_f32; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; + break; + } } // Configure kernel window @@ -563,6 +699,119 @@ void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window input, output); } +template +void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + constexpr int pool_size = 2; + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x); + const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y); + + const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y))); + const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 1)); + + const int scale_step_x = (pool_stride_x == 1) ? 2 : 1; + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto top_data = vld1q_u8(reinterpret_cast(input_top_ptr + input.offset())); + const auto bottom_data = vld1q_u8(reinterpret_cast(input_bottom_ptr + input.offset())); + uint8x8_t lower_res = {}; + uint8x8_t upper_res = {}; + + if(pooling_type != PoolingType::MAX) + { + const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } }; + const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } }; + + // Add rows + const uint16x8x2_t vrsum = + { + { + vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]), + vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]), + } + }; + + // Pair-wise add row data + const uint16x4x2_t vpsum = + { + { + vpadd_u16(vget_low_u16(vrsum.val[0]), vget_high_u16(vrsum.val[0])), + vpadd_u16(vget_low_u16(vrsum.val[1]), vget_high_u16(vrsum.val[1])), + } + }; + + uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]); + + // Scale lower result + scale_vector_s16x8(res_lower, id, 0, scale_step_x, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); + lower_res = vmovn_u16(res_lower); + + // Compute upper result for stride_x == 1 + if(pool_stride_x == 1) + { + // Shifted row sum + const uint16x8x2_t vrsum_shifted = + { + { + vextq_u16(vrsum.val[0], vrsum.val[1], 1), + vextq_u16(vrsum.val[1], vrsum.val[1], 1) + } + }; + + // Pair-wise add shifted row + const uint16x4x2_t vpsum_shifted = + { + { + vpadd_u16(vget_low_u16(vrsum_shifted.val[0]), vget_high_u16(vrsum_shifted.val[0])), + vpadd_u16(vget_low_u16(vrsum_shifted.val[1]), vget_high_u16(vrsum_shifted.val[1])), + } + }; + uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]); + + // Scale lower result + scale_vector_s16x8(res_upper, id, 1, 2, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); + upper_res = vmovn_u16(res_upper); + } + } + else + { + const uint8x16_t max_data = vmaxq_u8(top_data, bottom_data); + lower_res = vpmax_u8(vget_low_u8(max_data), 
vget_high_u8(max_data));
+            if(pool_stride_x == 1)
+            {
+                const uint8x16_t max_data_shifted = vextq_u8(max_data, max_data, 1);
+                upper_res                         = vpmax_u8(vget_low_u8(max_data_shifted), vget_high_u8(max_data_shifted));
+            }
+        }
+
+        // Store result
+        if(pool_stride_x == 1)
+        {
+            const uint8x8x2_t res = { { lower_res, upper_res } };
+            vst2_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+        }
+        else
+        {
+            vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), lower_res);
+        }
+    },
+    input, output);
+}
+
 template <PoolingType pooling_type>
 void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
 {
@@ -892,6 +1141,125 @@ void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window
     input, output);
 }
 
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    constexpr int pool_size     = 3;
+    int           pool_pad_x    = 0;
+    int           pool_pad_y    = 0;
+    int           pool_stride_x = 0;
+    int           pool_stride_y = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto top_data    = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
+        const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
+        const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
+
+        if(pooling_type == PoolingType::AVG)
+        {
+            // Convert data to u16
+            const uint16x8x2_t top_data_u16    = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
+            const uint16x8x2_t middle_data_u16 = { { vmovl_u8(vget_low_u8(middle_data)), vmovl_u8(vget_high_u8(middle_data)) } };
+            const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
+
+            // Calculate row sums
+            const uint16x8x2_t vrsum =
+            {
+                {
+                    vaddq_u16(vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]), middle_data_u16.val[0]),
+                    vaddq_u16(vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]), middle_data_u16.val[1]),
+                }
+            };
+            const uint16x8x2_t vrsum_shifted_1 =
+            {
+                {
+                    vextq_u16(vrsum.val[0], vrsum.val[1], 1),
+                    vextq_u16(vrsum.val[1], vrsum.val[1], 1)
+                }
+            };
+            const uint16x8x2_t vrsum_shifted_2 =
+            {
+                {
+                    vextq_u16(vrsum.val[0], vrsum.val[1], 2),
+                    vextq_u16(vrsum.val[1], vrsum.val[1], 2)
+                }
+            };
+            // Calculate final sum
+            uint16x8x2_t final_sum =
+            {
+                {
+                    vaddq_u16(vaddq_u16(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
+                    vaddq_u16(vaddq_u16(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
+                }
+            };
+            if(pool_stride_x == 2)
+            {
+                uint16x8_t res =
+                {
+                    vgetq_lane_u16(final_sum.val[0], 0),
+                    vgetq_lane_u16(final_sum.val[0], 2),
+                    vgetq_lane_u16(final_sum.val[0], 4),
+                    vgetq_lane_u16(final_sum.val[0], 6),
+                    vgetq_lane_u16(final_sum.val[1], 0),
+                    vgetq_lane_u16(final_sum.val[1], 2),
+                    vgetq_lane_u16(final_sum.val[1], 4),
+                    vgetq_lane_u16(final_sum.val[1], 6),
+                };
+
+                scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
+                                                    pool_size, upper_bound_w, upper_bound_h,
+                                                    pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
+            }
+            else
+            {
+                // Scale lower result
+                scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
+                                                    pool_size, upper_bound_w, upper_bound_h,
+                                                    pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+                // Scale upper result
+                scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
+                                                    pool_size, upper_bound_w, upper_bound_h,
+                                                    pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+                const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
+                vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+            }
+        }
+        else
+        {
+            const uint8x16_t max_data        = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data);
+            const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1);
+            const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2);
+            const uint8x16_t final_max       = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2);
+
+            if(pool_stride_x == 2)
+            {
+                const uint8x8x2_t      table      = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
+                static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+                const uint8x8_t        res        = vtbl2_u8(table, lookup_val);
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+            }
+            else
+            {
+                vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
+            }
+        }
+    },
+    input, output);
+}
+
 template <PoolingType pooling_type>
 void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
 {
@@ -1232,6 +1600,98 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window
     input, output);
 }
 
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
+    int       pool_pad_x    = 0;
+    int       pool_pad_y    = 0;
+    int       pool_stride_x = 0;
+    int       pool_stride_y = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        uint8_t res = 0;
+
+        if(pooling_type != PoolingType::MAX)
+        {
+            uint32x4_t vres = vdupq_n_u32(0);
+            uint32_t   sres = 0;
+
+            // Calculate scale
+            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+
+            // Perform pooling
+            for(int y = 0; y < pool_size; ++y)
+            {
+                int x = 0;
+                for(; x <= (pool_size - 8); x += 8)
+                {
+                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+
+                    const uint16x8_t data_u16 = vmovl_u8(data);
+                    vres                      = vaddq_u32(vres, vaddl_u16(vget_high_u16(data_u16), vget_low_u16(data_u16)));
+                }
+
+                // Leftover for loop
+                for(; x < pool_size; ++x)
+                {
+                    uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    sres += data;
+                }
+            }
+
+            // Reduction
+            const auto tmp = vpadd_u32(vget_high_u32(vres), vget_low_u32(vres));
+            sres += vget_lane_u32(tmp, 0) + vget_lane_u32(tmp, 1);
+
+            // Divide by scale
+            res = static_cast<uint8_t>(support::cpp11::round(sres * scale));
+        }
+        else
+        {
+            uint8x8_t vres = vdup_n_u8(0);
+            res            = 0;
+
+            for(int y = 0; y < pool_size; ++y)
+            {
+                int x = 0;
+                for(; x <= (pool_size - 8); x += 8)
+                {
+                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    vres                 = vmax_u8(vres, data);
+                }
+
+                // Leftover for loop
+                for(; x < pool_size; ++x)
+                {
+                    const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    res                = std::max(res, data);
+                }
+            }
+
+            // Reduce max
+            vres = vpmax_u8(vres, vres);
+            vres = vpmax_u8(vres, vres);
+            vres = vpmax_u8(vres, vres);
+
+            // Get max value
+            res = std::max(res, vget_lane_u8(vres, 0));
+        }
+
+        // Store result
+        *(reinterpret_cast<uint8_t *>(output.ptr())) = res;
+    },
+    input, output);
+}
+
 Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
@@ -1269,6 +1729,7 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
 
     const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
     const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
+    const unsigned int pool_size     = _pool_info.pool_size();
 
     // Set step for input in x and y direction for the input
     Window window_input(window);
@@ -1282,6 +1743,15 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
             window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
             break;
         }
+        case DataType::QASYMM8:
+        {
+            window_x_inc = pool_stride_x;
+            if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+            {
+                window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+            }
+            break;
+        }
         case DataType::F32:
         {
             window_x_inc = pool_stride_x;
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index a8249c4840..83a843de58 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -320,6 +320,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const
 {
     switch(dt)
     {
+        case DataType::QASYMM8:
         case DataType::U8:
             print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
             break;
@@ -355,6 +356,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp
 {
     switch(dt)
     {
+        case DataType::QASYMM8:
         case DataType::U8:
             return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
         case DataType::QS8:
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 2341633362..201bf87b47 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -40,16 +40,14 @@ void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const Poolin
     k->configure(input, output, pool_info);
     _kernel = std::move(k);
 
-    // Configure border depending on operation required
+    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
     BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    // Quantize border in case data type is quantized asymmetric data type
-    uint32_t border_value = 0;
+    PixelValue zero_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
     {
-        border_value = static_cast<uint32_t>(input->info()->quantization_info().quantize(0.f, RoundingPolicy::TO_NEAREST_UP));
+        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
     }
-
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(border_value));
+    _border_handler.configure(input, _kernel->border_size(), border_mode, zero_value);
 }
 
 Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 530c7fca4a..8a32507a73 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -43,9 +43,14 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
     // Configure pooling kernel
     _pooling_layer_kernel.configure(input, output, pool_info);
 
-    // Configure border depending on operation required
+    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
     BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(static_cast<float>(0.f)));
+    PixelValue zero_value(0.f);
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+    {
+        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+    }
+    _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
 }
 
 Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
diff --git a/tests/benchmark/NEON/PoolingLayer.cpp b/tests/benchmark/NEON/PoolingLayer.cpp
index 4815959ae6..46f9dfe361 100644
--- a/tests/benchmark/NEON/PoolingLayer.cpp
+++ b/tests/benchmark/NEON/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -46,9 +46,9 @@ namespace test
 namespace
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32, DataType::QS8 });
+const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32, DataType::QS8, DataType::QASYMM8 });
 #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-const auto data_types = framework::dataset::make("DataType", { DataType::F32, DataType::QS8 });
+const auto data_types = framework::dataset::make("DataType", { DataType::F32, DataType::QS8, DataType::QASYMM8 });
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 } // namespace
 
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 79a732a59a..86fd802d13 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -53,12 +53,18 @@ const auto PoolingLayerDatasetQS = combine(combine(combine(framework::dataset::m
                                            framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })),
                                            framework::dataset::make("ExcludePadding", { false }));
 
+/** Input data set for asymmetric data type */
+const auto PoolingLayerDatasetQASYMM8 = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { 2, 3, 9 })),
+                                                framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })),
+                                                framework::dataset::make("ExcludePadding", { true, false }));
+
 constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for float types */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr AbsoluteTolerance<float> tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for float types */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<qint8_t> tolerance_qs8(0); /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
-constexpr AbsoluteTolerance<qint16_t> tolerance_qs16(0); /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
+constexpr AbsoluteTolerance<float> tolerance_f16(0.01f);    /**< Tolerance value for comparing reference's output against implementation's output for float types */
+#endif                                                      /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<qint8_t> tolerance_qs8(0);      /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
+constexpr AbsoluteTolerance<qint16_t> tolerance_qs16(0);    /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);  /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */
 } // namespace
 
 TEST_SUITE(NEON)
@@ -148,7 +154,7 @@ TEST_SUITE_END()
 template <typename T>
 using NEPoolingLayerFixedPointFixture = PoolingLayerValidationFixedPointFixture<Tensor, Accessor, NEPoolingLayer, T>;
 
-TEST_SUITE(Quantized)
+TEST_SUITE(FixedPoint)
 TEST_SUITE(QS8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixedPointFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQS,
                                                                                                                framework::dataset::make("DataType", DataType::QS8))),
@@ -184,6 +190,31 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixedPointFixture<int16_t>, frame
 TEST_SUITE_END()
 TEST_SUITE_END()
 
+TEST_SUITE(Quantized)
+
+template <typename T>
+using NEPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture<Tensor, Accessor, NEPoolingLayer, T>;
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8,
+                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127),
+                                                                                                                                                              QuantizationInfo(7.f / 255, 123)
+                                                                                                                                                            })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8,
+                                                                                                                   framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
-- 
cgit v1.2.1
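
Note on the QASYMM8 average-pooling arithmetic in the patch above: the kernels sum the raw unsigned 8-bit values in 16/32-bit accumulators and rescale each output by the size of its pooling region before narrowing back to 8 bits; scale_vector_s16x8() and calculate_avg_scale() compute that per-output divisor, shrinking it to the in-bounds area when ExcludePadding is set. The sketch below is a minimal scalar model of the same arithmetic, not part of the patch: the helper name avg_pool_qasymm8 is hypothetical, the NEON vectorization is dropped, and the constant border is modelled as 0, whereas the patch fills it with the quantization offset (the quantized value of 0.f) via the PixelValue changes in CLPoolingLayer/NEPoolingLayer.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Scalar model of the QASYMM8 AVG path: sum the u8 samples of one pooling
    // window, divide by the window area and round to nearest, as the NEON
    // kernels do with u16/u32 accumulators.
    static uint8_t avg_pool_qasymm8(const std::vector<uint8_t> &in, int w, int h,
                                    int out_x, int out_y, int pool_size, int stride,
                                    int pad, bool exclude_padding)
    {
        int start_x = out_x * stride - pad;
        int start_y = out_y * stride - pad;
        // Mirrors upper_bound_w/h in the kernels: padding extends the valid
        // range only when it is *included* in the average.
        const int end_x = std::min(start_x + pool_size, w + (exclude_padding ? 0 : pad));
        const int end_y = std::min(start_y + pool_size, h + (exclude_padding ? 0 : pad));
        if(exclude_padding)
        {
            start_x = std::max(0, start_x);
            start_y = std::max(0, start_y);
        }

        uint32_t sum = 0;
        for(int y = start_y; y < end_y; ++y)
        {
            for(int x = start_x; x < end_x; ++x)
            {
                // Out-of-bounds samples are the constant border, modelled as 0
                // here; the patch fills them with the quantized zero-point.
                if(x >= 0 && x < w && y >= 0 && y < h)
                {
                    sum += in[y * w + x];
                }
            }
        }

        // The divisor is the (possibly clamped) pooling region, so excluded
        // padding never dilutes the average.
        const float scale = 1.f / ((end_y - start_y) * (end_x - start_x));
        return static_cast<uint8_t>(std::lround(sum * scale));
    }

Because averaging raw quantized codes is linear, the result stays in the input's quantization space, so the AVG path never needs to dequantize; the 1-LSB tolerance_qasymm8 in the tests leaves room for a one-unit rounding difference against the reference.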