From 0c34fe29c298057091d48cde332cb60bb14efee1 Mon Sep 17 00:00:00 2001
From: Pablo Tello <pablo.tello@arm.com>
Date: Mon, 26 Jun 2017 17:17:42 +0100
Subject: COMPMID-421: Added FP16 support in Pooling Layer

Change-Id: I6b6119c8770051c1656da40aa073c539c15b493e
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78985
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
---
 .../core/NEON/kernels/NEPoolingLayerKernel.h       |  17 ++-
 arm_compute/core/PixelValue.h                      |  48 ++++--
 scripts/check_clang-tidy.py                        |   1 +
 scripts/clang-tidy.h                               |  23 ++-
 src/core/NEON/kernels/NEFillBorderKernel.cpp       |  18 ++-
 src/core/NEON/kernels/NEPoolingLayerKernel.cpp     | 166 +++++++++++++++++++--
 tests/benchmark_new/NEON/PoolingLayer.cpp          |  23 +--
 tests/validation/NEON/PoolingLayer.cpp             |  31 +++-
 tests/validation/Reference.cpp                     |   1 +
 tests/validation/TensorOperations.h                |  22 ++-
 10 files changed, 306 insertions(+), 44 deletions(-)
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index bf06fdd639..a5de81137b 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -48,7 +48,7 @@ public:
     ~NEPoolingLayerKernel() = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. Data types supported: QS8/F32.
+     * @param[in]  input     Source tensor. Data types supported: QS8/F16/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
@@ -66,6 +66,14 @@ private:
      */
     template <PoolingType pooling_type>
     void pooling2_f32(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling for float16_t.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling2_f16(const Window &window_input, const Window &window);
+
     /** Function to perform 2x2 pooling for 8bit fixed point.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -80,6 +88,13 @@ private:
      */
     template <PoolingType pooling_type>
     void pooling3_f32(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling3_f16(const Window &window_input, const Window &window);
     /** Function to perform 3x3 pooling for 8bit fixed point.
      *
      * @param[in] window_input Input region on which to execute the kernel.
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index b4912ce15a..1b1a5a3845 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -26,6 +26,10 @@
 
 #include <cstdint>
 
+#if ARM_COMPUTE_ENABLE_FP16
+#include <arm_fp16.h> // needed for float16_t
+#endif                /* ARM_COMPUTE_ENABLE_FP16 */
+
 namespace arm_compute
 {
 /** Class describing the value of a pixel for any image format. */
@@ -82,6 +86,17 @@ public:
     {
         value.s32 = v;
     }
+#if ARM_COMPUTE_ENABLE_FP16
+    /** Initialize the union with a F16 pixel value
+     *
+     * @param[in] v F16 value.
+     */
+    PixelValue(float16_t v)
+        : PixelValue()
+    {
+        value.f16 = v;
+    }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
     /** Initialize the union with a F32 pixel value
      *
      * @param[in] v F32 value.
@@ -96,16 +111,19 @@ public:
      */
     union
         {
-            uint8_t  rgb[3];  /**< 3 channels: RGB888 */
-            uint8_t  yuv[3];  /**< 3 channels: Any YUV format */
-            uint8_t  rgbx[4]; /**< 4 channels: RGBX8888 */
-            float    f32;     /**< Single channel float 32 */
-            uint8_t  u8;      /**< Single channel U8 */
-            int8_t   s8;      /**< Single channel S8 */
-            uint16_t u16;     /**< Single channel U16 */
-            int16_t  s16;     /**< Single channel S16 */
-            uint32_t u32;     /**< Single channel U32 */
-            int32_t  s32;     /**< Single channel S32 */
+            uint8_t rgb[3];  /**< 3 channels: RGB888 */
+            uint8_t yuv[3];  /**< 3 channels: Any YUV format */
+            uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
+            float   f32;     /**< Single channel float 32 */
+#if ARM_COMPUTE_ENABLE_FP16
+            float16_t f16; /**< Single channel F16 */
+#endif                 /* ARM_COMPUTE_ENABLE_FP16 */
+            uint8_t  u8;   /**< Single channel U8 */
+            int8_t   s8;   /**< Single channel S8 */
+            uint16_t u16;  /**< Single channel U16 */
+            int16_t  s16;  /**< Single channel S16 */
+            uint32_t u32;  /**< Single channel U32 */
+            int32_t  s32;  /**< Single channel S32 */
         } value;
     /** Interpret the pixel value as a U8
      *
@@ -155,6 +173,16 @@ public:
     {
         v = value.s32;
     }
+#if ARM_COMPUTE_ENABLE_FP16
+    /** Interpret the pixel value as a F16
+     *
+     * @param[out] v Returned value
+     */
+    void get(float16_t &v) const
+    {
+        v = value.f16;
+    }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
     /** Interpret the pixel value as a F32
      *
      * @param[out] v Returned value
diff --git a/scripts/check_clang-tidy.py b/scripts/check_clang-tidy.py
index b24f5f7f8f..6c2173b6fe 100755
--- a/scripts/check_clang-tidy.py
+++ b/scripts/check_clang-tidy.py
@@ -46,6 +46,7 @@ if __name__ == "__main__":
                    ("ReferenceCPP.cpp" in line and "parameter 'srcs' is unused" in line) or
                    ("ReferenceCPP.cpp" in line and re.search(r"parameter '[^']+' is unused", line)) or
                    ("NEGEMMMatrixMultiplyKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
+                   ("NEPoolingLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
                    "3rdparty" in line):
                     continue
 
diff --git a/scripts/clang-tidy.h b/scripts/clang-tidy.h
index cbc0d07cd6..ccf267e9b8 100644
--- a/scripts/clang-tidy.h
+++ b/scripts/clang-tidy.h
@@ -1,5 +1,15 @@
 #include <arm_neon.h>
 
+inline float16x4_t vpmax_f16 (float16x4_t, float16x4_t)
+{
+  return vdup_n_f16(0);
+}
+
+inline float16x4_t vpadd_f16 (float16x4_t, float16x4_t)
+{
+  return vdup_n_f16(0);
+}
+
 inline float16x8_t vmulq_lane_f16 (float16x8_t, float16x4_t, const int)
 {
   return vdupq_n_f16(0);
@@ -7,22 +17,27 @@ inline float16x8_t vmulq_lane_f16 (float16x8_t, float16x4_t, const int)
 
 inline float16x4_t vmul_f16 (float16x4_t, float16x4_t)
 {
-  return vdup_n_u16(0);
+  return vdup_n_f16(0);
 }
 
 inline float16x4_t vadd_f16 (float16x4_t, float16x4_t)
 {
-  return vdup_n_u16(0);
+  return vdup_n_f16(0);
 }
 
 inline float16x4_t vmul_lane_f16 (float16x4_t, float16x4_t, const int)
 {
-  return vdup_n_u16(0);
+  return vdup_n_f16(0);
 }
 
 inline float16x4_t vmul_n_f16 (float16x4_t, float16_t)
 {
-  return vdup_n_u16(0);
+  return vdup_n_f16(0);
+}
+
+inline float16x4_t vmax_f16(float16x4_t, float16x4_t)
+{
+  return vdup_n_f16(0);
 }
 
 inline float16x8_t vcvtq_f16_u16(uint16x8_t)
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index bd99242b11..cd84e36aad 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -33,6 +33,10 @@
 #include <algorithm>
 #include <cstdint>
 
+#if ARM_COMPUTE_ENABLE_FP16
+#include <arm_fp16.h> // needed for float16_t
+#endif                /* ARM_COMPUTE_ENABLE_FP16 */
+
 using namespace arm_compute;
 
 namespace arm_compute
@@ -47,7 +51,7 @@ NEFillBorderKernel::NEFillBorderKernel()
 
 void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
 
     _tensor                = tensor;
     _border_size           = border_size;
@@ -100,6 +104,12 @@ void NEFillBorderKernel::run(const Window &window)
                 case DataType::S32:
                     fill_constant_value_single_channel<int32_t>(window);
                     break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+                case DataType::F16:
+                    static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
+                    fill_constant_value_single_channel<float16_t>(window);
+                    break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
                 case DataType::F32:
                     static_assert(sizeof(float) == 4, "Float must be 32 bit");
                     fill_constant_value_single_channel<float>(window);
@@ -133,6 +143,12 @@ void NEFillBorderKernel::run(const Window &window)
                 case DataType::S32:
                     fill_replicate_single_channel<int32_t>(window);
                     break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+                case DataType::F16:
+                    static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
+                    fill_replicate_single_channel<float16_t>(window);
+                    break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
                 case DataType::F32:
                     static_assert(sizeof(float) == 4, "Float must be 32 bit");
                     fill_replicate_single_channel<float>(window);
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 2ef2b9881f..ce977140fb 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -48,17 +48,17 @@ namespace
 inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
-    int start_x = id.x() * stride_x - pad_x;
-    int start_y = id.y() * stride_y - pad_y;
-    int end_x   = std::min(start_x + pool_size, upper_bound_w);
-    int end_y   = std::min(start_y + pool_size, upper_bound_h);
+    const int start_x = id.x() * stride_x - pad_x;
+    const int start_y = id.y() * stride_y - pad_y;
+    const int end_x   = std::min(start_x + pool_size, upper_bound_w);
+    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
     return 1.f / ((end_y - start_y) * (end_x - start_x));
 }
 
 inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
                                       int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
 {
-    static std::array<qint8_t, 10> scale_values_q8 =
+    static const std::array<qint8_t, 10> scale_values_q8 =
     { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
     const int start_x = id.x() * stride_x - pad_x;
     const int start_y = id.y() * stride_y - pad_y;
@@ -96,8 +96,11 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
     static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
     ARM_COMPUTE_UNUSED(supported_pool_sizes);
 
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
     ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
     ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
@@ -140,9 +143,30 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Pooling size not supported");
+                    break;
             }
             num_elems_horizontal_window = 8;
             break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+        case DataType::F16:
+            switch(pool_size)
+            {
+                case 2:
+                    num_elems_read_per_iteration      = 16;
+                    num_elems_processed_per_iteration = 8;
+                    num_elems_horizontal_window       = 8;
+                    break;
+                case 3:
+                    num_elems_read_per_iteration      = 4;
+                    num_elems_processed_per_iteration = 1;
+                    num_elems_horizontal_window       = 1;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Pooling size not supported");
+                    break;
+            }
+            break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
         case DataType::F32:
             switch(pool_size)
             {
@@ -157,6 +181,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Pooling size not supported");
+                    break;
             }
             num_elems_processed_per_iteration = 1;
             num_elems_horizontal_window       = 1;
@@ -188,6 +213,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
             {
                 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
             }
+            else if(input->info()->data_type() == DataType::F16)
+            {
+                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+            }
             else if(input->info()->data_type() == DataType::F32)
             {
                 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
@@ -198,6 +227,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
             {
                 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
             }
+            else if(input->info()->data_type() == DataType::F16)
+            {
+                _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+            }
             else if(input->info()->data_type() == DataType::F32)
             {
                 _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
@@ -265,6 +298,101 @@ void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window
     input, output);
 }
 
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    constexpr const int pool_size     = 3;
+    int                 pool_pad_x    = 0;
+    int                 pool_pad_y    = 0;
+    int                 pool_stride_x = 0;
+    int                 pool_stride_y = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+    const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+        const float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
+        const float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+        float16x4_t       res         = {};
+        if(pooling_type == PoolingType::AVG)
+        {
+            // Calculate scale
+            const float       scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float16x4_t scale_v = vdup_n_f16(scale);
+            // Perform pooling
+            const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
+            res                        = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
+            res                        = vmul_f16(vpadd_f16(res, res), scale_v);
+        }
+        else
+        {
+            const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
+            res                        = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
+            res                        = vpmax_f16(res, res);
+        }
+        *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+    },
+    input, output);
+#else  /* ARM_COMPUTE_ENABLE_FP16 */
+    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+    Iterator      input(_input, window_input);
+    Iterator      output(_output, window);
+    constexpr int pool_size = 2;
+    int           pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto  top_data    = vld2q_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+        const auto  bottom_data = vld2q_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+        float16x8_t res         = {};
+
+        if(pooling_type == PoolingType::AVG)
+        {
+            const float       scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float16x8_t scale_v = vdupq_n_f16(scale);
+            res                       = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
+        }
+        else
+        {
+            res = vmaxq_f16(bottom_data.val[1], vmaxq_f16(bottom_data.val[0], vmaxq_f16(top_data.val[0], top_data.val[1])));
+        }
+        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+    },
+    input, output);
+#else  /* ARM_COMPUTE_ENABLE_FP16 */
+    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+}
+
 template <PoolingType pooling_type>
 void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
 {
@@ -496,19 +624,29 @@ void NEPoolingLayerKernel::run(const Window &window)
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    unsigned int pool_stride_x, pool_stride_y = 0;
-    std::tie(pool_stride_x, pool_stride_y)    = _pool_info.pad_stride_info().stride();
+    const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
+    const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
 
     // Set step for input in x and y direction for the input
     Window       window_input(window);
     unsigned int window_x_inc = 0;
-    if(_input->info()->data_type() == DataType::QS8)
-    {
-        window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
-    }
-    else
+    switch(_input->info()->data_type())
     {
-        window_x_inc = pool_stride_x;
+        case DataType::QS8:
+        case DataType::F16:
+        {
+            window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+            break;
+        }
+        case DataType::F32:
+        {
+            window_x_inc = pool_stride_x;
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not supported");
+        }
     }
     window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
     window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
diff --git a/tests/benchmark_new/NEON/PoolingLayer.cpp b/tests/benchmark_new/NEON/PoolingLayer.cpp
index baa6e31483..c9d598d4a9 100644
--- a/tests/benchmark_new/NEON/PoolingLayer.cpp
+++ b/tests/benchmark_new/NEON/PoolingLayer.cpp
@@ -39,24 +39,29 @@ namespace arm_compute
 {
 namespace test
 {
+namespace
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+const auto alexnet_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::F16, DataType::F32 });
+const auto lenet_data_types   = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
+#else  /* ARM_COMPUTE_ENABLE_FP16 */
+const auto alexnet_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::F32 });
+const auto lenet_data_types   = framework::dataset::make("DataType", { DataType::F32 });
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+} // namespace
+
 using NEPoolingLayerFixture = PoolingLayerFixture<Tensor, NEPoolingLayer, Accessor>;
 
 TEST_SUITE(NEON)
 
 REGISTER_FIXTURE_DATA_TEST_CASE(AlexNetPoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL,
-                                framework::dataset::combine(framework::dataset::combine(datasets::AlexNetPoolingLayerDataset(),
-                                                                                        framework::dataset::make("DataType", { DataType::F32, DataType::QS8 })),
-                                                            framework::dataset::make("Batches", { 1, 4, 8 })));
+                                framework::dataset::combine(framework::dataset::combine(datasets::AlexNetPoolingLayerDataset(), alexnet_data_types), framework::dataset::make("Batches", { 1, 4, 8 })));
 
 REGISTER_FIXTURE_DATA_TEST_CASE(LeNet5PoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL,
-                                framework::dataset::combine(framework::dataset::combine(datasets::LeNet5PoolingLayerDataset(),
-                                                                                        framework::dataset::make("DataType", DataType::F32)),
-                                                            framework::dataset::make("Batches", { 1, 4, 8 })));
+                                framework::dataset::combine(framework::dataset::combine(datasets::LeNet5PoolingLayerDataset(), lenet_data_types), framework::dataset::make("Batches", { 1, 4, 8 })));
 
 REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetPoolingLayer, NEPoolingLayerFixture, framework::DatasetMode::ALL,
-                                framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetPoolingLayerDataset(),
-                                                                                        framework::dataset::make("DataType", DataType::F32)),
-                                                            framework::dataset::make("Batches", { 1, 4, 8 })));
+                                framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetPoolingLayerDataset(), lenet_data_types), framework::dataset::make("Batches", { 1, 4, 8 })));
 
 TEST_SUITE_END()
 } // namespace test
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 9d6c8824ca..3961770310 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -39,8 +39,11 @@ using namespace arm_compute::test::validation;
 
 namespace
 {
-const float tolerance_q = 0;     /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
-const float tolerance_f = 1e-05; /**< Tolerance value for comparing reference's output against implementation's output for float input */
+const float tolerance_q   = 0;     /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
+const float tolerance_f32 = 1e-05; /**< Tolerance value for comparing reference's output against implementation's output for float input */
+#ifdef ARM_COMPUTE_ENABLE_FP16
+const float tolerance_f16 = 0.001f; /**< Tolerance value for comparing reference's output against half precision floating point implementation's output */
+#endif                              /* ARM_COMPUTE_ENABLE_FP16 */
 
 /** Compute Neon pooling layer function.
  *
@@ -73,6 +76,7 @@ Tensor compute_pooling_layer(const TensorShape &shape_in, const TensorShape &sha
     switch(dt)
     {
         case DataType::F32:
+        case DataType::F16:
             min = -1;
             max = 1;
             break;
@@ -123,7 +127,7 @@ BOOST_DATA_TEST_CASE(RandomDataset,
     RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info);
 
     // Validate output
-    validate(Accessor(dst), ref_dst, tolerance_f, 0);
+    validate(Accessor(dst), ref_dst, tolerance_f32, 0);
 }
 
 BOOST_DATA_TEST_CASE(RunSmall7x7,
@@ -140,10 +144,29 @@ BOOST_DATA_TEST_CASE(RunSmall7x7,
     RawTensor ref_dst = Reference::compute_reference_pooling_layer(src_shape, dst_shape, dt, pool_info);
 
     // Validate output
-    validate(Accessor(dst), ref_dst, tolerance_f, 0);
+    validate(Accessor(dst), ref_dst, tolerance_f32, 0);
 }
 BOOST_AUTO_TEST_SUITE_END()
 
+#ifdef ARM_COMPUTE_ENABLE_FP16
+BOOST_AUTO_TEST_SUITE(Float16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RandomDataset,
+                     RandomPoolingLayerDataset() * boost::unit_test::data::make(DataType::F16),
+                     obj, dt)
+{
+    // Compute function
+    Tensor dst = compute_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst, tolerance_f16, 0);
+}
+BOOST_AUTO_TEST_SUITE_END()
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
 BOOST_AUTO_TEST_SUITE(Quantized)
 BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
 BOOST_DATA_TEST_CASE(RandomDataset,
diff --git a/tests/validation/Reference.cpp b/tests/validation/Reference.cpp
index c0d5a20c16..b838907320 100644
--- a/tests/validation/Reference.cpp
+++ b/tests/validation/Reference.cpp
@@ -698,6 +698,7 @@ RawTensor Reference::compute_reference_pooling_layer(const TensorShape &shape_in
     switch(dt)
     {
         case DataType::F32:
+        case DataType::F16:
             min = -1;
             max = 1;
             break;
diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h
index 67dadd6da3..b8e5a6678c 100644
--- a/tests/validation/TensorOperations.h
+++ b/tests/validation/TensorOperations.h
@@ -44,6 +44,26 @@
 #include <string>
 #include <vector>
 
+#if ARM_COMPUTE_ENABLE_FP16
+//Beware! most std templates acting on types don't work with the data type float16_t
+namespace std
+{
+template <>
+class numeric_limits<float16_t>
+{
+public:
+    static float16_t lowest()
+    {
+        return -std::numeric_limits<float>::max(); // -inf
+    };
+    static float16_t max()
+    {
+        return std::numeric_limits<float>::max(); // +inf
+    };
+};
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
 namespace arm_compute
 {
 namespace test
@@ -1476,7 +1496,7 @@ void pooling_layer(const Tensor<T> &in, Tensor<T> &out, PoolingLayerInfo pool_in
                     {
                         for(int x = wstart; x < wend; ++x)
                         {
-                            T val = in[r * h_in * w_in + y * w_in + x];
+                            const T val = in[r * h_in * w_in + y * w_in + x];
                             if(val > max_val)
                             {
                                 max_val = val;
-- 
cgit v1.2.1