COMPMID-2479: Extend CLPoolingLayer max pooling to extract indices

Fix PoolingLayer max pooling reference bug to extract indices. Extend CLPoolingLayer max pooling to extract indices, all the paddings need to be substracted. Signed-off-by: Sheri Zhang <sheri.zhang@arm.com> Change-Id: If8e82e7f7e03172ad05f5a7cd5f13cf682fd1ffc Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3649 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Pablo Marquez <pablo.tello@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Sheri Zhang <sheri.zhang@arm.com> 2020-08-03 20:11:56 +0100
committer: Sheri Zhang <sheri.zhang@arm.com> 2020-08-05 15:59:59 +0000
commit: 801bbcb2f9a16633d0071b55aaf93b89870248ee (patch)
tree: 4581fd059191e89803f3d77db0b052714bc20eba
parent: 0499dff9293a86d3d53f72fed0a38b2823563674 (diff)
download: ComputeLibrary-801bbcb2f9a16633d0071b55aaf93b89870248ee.tar.gz
5 files changed, 623 insertions, 67 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 73874b69b2..9dec79b01a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -325,6 +325,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "pooling_layer_7", "pooling_layer.cl" },
     { "pooling_layer_MxN_nchw", "pooling_layer.cl" },
     { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
+    { "pooling_layer_2_nhwc_indices_fp32", "pooling_layer.cl" },
+    { "pooling_layer_2_nhwc_indices_fp16", "pooling_layer.cl" },
+    { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" },
+    { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" },
     { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
     { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
     { "prior_box_layer_nchw", "prior_box_layer.cl" },
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 2a0e040cf1..9e6521b300 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -192,22 +192,22 @@ ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y,
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_2(
     TENSOR3D_DECLARATION(input),
@@ -256,22 +256,22 @@ __kernel void pooling_layer_2(
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_3(
     TENSOR3D_DECLARATION(input),
@@ -344,22 +344,22 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_optimized_3(
     TENSOR3D_DECLARATION(input),
@@ -402,22 +402,22 @@ __kernel void pooling_layer_optimized_3(
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_MxN_nchw(
     TENSOR3D_DECLARATION(input),
@@ -515,17 +515,17 @@ ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_siz
  *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
  * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
  * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
  * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -534,7 +534,7 @@ ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_siz
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_MxN_nhwc(
     TENSOR4D_DECLARATION(input),
@@ -572,7 +572,7 @@ __kernel void pooling_layer_MxN_nhwc(
             data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
 #else  /* defined(DST_DEPTH) */
             VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
-            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+            data0    = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
 #endif /* defined(DST_DEPTH) */
 
 #if defined(POOL_L2)
@@ -596,3 +596,443 @@ __kernel void pooling_layer_MxN_nhwc(
     // Store result
     vstore8(CONVERT(vdata, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)output.ptr);
 }
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
+{
+    const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+    const int pad_vert  = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
+
+    const int x = get_global_id(0) * STRIDE_X;
+    const int y = get_global_id(1) * STRIDE_Y;
+    const int z = get_global_id(2);
+
+    //x axis: width, y axis: height, z axis: component
+    const uint padded_offset = input->offset_first_element_in_bytes
+                               + x * input->stride_x
+                               + y * input->stride_y
+                               + z * input->stride_z;
+
+    const uint offset_base = padded_offset
+                             - y * pad_horiz * sizeof(DATA_TYPE)                                               /* Horizontal padding for each row */
+                             - PAD_TENSOR_TOP * input->stride_y                                                /* top padding */
+                             - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
+                             - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
+    *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
+#else  /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+    *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
+#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+
+    *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+    return;
+}
+
+inline void offset_no_padding_nhwc_3D(const Tensor3D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3)
+{
+    const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+
+    const int x = get_global_id(0);
+    const int y = get_global_id(1) * STRIDE_X;
+    const int z = get_global_id(2) * STRIDE_Y;
+
+    //x axis: component, y axis: width, z axis: height
+    const uint padded_offset = input->offset_first_element_in_bytes
+                               + x * 8 * input->stride_x
+                               + y * input->stride_y
+                               + z * input->stride_z;
+
+    const uint offset_base = padded_offset
+                             - (z + 1) * PAD_TENSOR_TOP * input->stride_y    /* Top padding for each z plane */
+                             - y * pad_horiz * sizeof(DATA_TYPE)             /* Horizontal padding for each row */
+                             - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */
+                             - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+    *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE);
+    *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+    *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE);
+    *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+    return;
+}
+
+#if defined(DST_DEPTH)
+inline void offset_no_padding_nhwc_4D(const Tensor4D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3)
+{
+    const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+    const int z_max     = get_global_size(2) / BATCH_SIZE;
+
+    const int x = get_global_id(0);
+    const int y = get_global_id(1) * STRIDE_X;
+    const int z = (get_global_id(2) % z_max) * STRIDE_Y;
+    const int w = get_global_id(2) / z_max;
+
+    const unsigned int padded_offset = input->offset_first_element_in_bytes
+                                       + x * 8 * input->stride_x
+                                       + y * input->stride_y
+                                       + z * input->stride_z;
+
+    const unsigned int offset_base = padded_offset
+                                     - (z + 1) * PAD_TENSOR_TOP * input->stride_y    /* Top padding for each z plane */
+                                     - y * pad_horiz * sizeof(DATA_TYPE)             /* Horizontal padding for each row */
+                                     - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */
+                                     - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+    *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE);
+    *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+    *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE);
+    *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+    return;
+}
+#endif //defined(DST_DEPTH)
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp32(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    // Get pixels pointer
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    // Load data
+    float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
+    float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Perform calculations
+    float data0_max = POOL_OP(data0.s0, data0.s1);
+    float data1_max = POOL_OP(data1.s0, data1.s1);
+    float res       = POOL_OP(data0_max, data1_max);
+    // Store result
+    *(__global float *)output.ptr = res;
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_top    = 0;
+    uint offset_bottom = 0;
+
+    offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+
+    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+    *(__global uint *)indices.ptr = index;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+}
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp16(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    // Get pixels pointer
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    // Load data
+    half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
+    half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Perform calculations
+    half data0_max = POOL_OP(data0.s0, data0.s1);
+    half data1_max = POOL_OP(data1.s0, data1.s1);
+    half res       = POOL_OP(data0_max, data1_max);
+    // Store result
+    *(__global half *)output.ptr = res;
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_top    = 0;
+    uint offset_bottom = 0;
+
+    offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+
+    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+    *(__global uint *)indices.ptr = index;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+}
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_stride_w                      Stride of the indices tensor in W dimension (in bytes)
+ * @param[in]  indices_step_w                        indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nhwc_indices_fp32(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output),
+    TENSOR4D_DECLARATION(indices))
+{
+    // Get pixels pointer
+#if defined(DST_DEPTH)
+    Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+    Tensor4D output  = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH);
+#else  /* defined(DST_DEPTH) */
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+#endif /* defined(DST_DEPTH) */
+
+#if defined(DST_DEPTH)
+    // Load data
+    float8 data_top0    = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 0, 0));
+    float8 data_top1    = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 0, 0));
+    float8 data_bottom0 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 1, 0));
+    float8 data_bottom1 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 1, 0));
+#else  /* defined(DST_DEPTH) */
+    // Load data
+    float8   data_top0    = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
+    float8   data_top1    = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float8   data_bottom0 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 1));
+    float8   data_bottom1 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 1));
+#endif /* defined(DST_DEPTH) */
+
+    float8 data_top_max    = POOL_OP(data_top0, data_top1);
+    float8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1);
+    float8 data_max        = POOL_OP(data_top_max, data_bottom_max);
+    vstore8(data_max, 0, (__global float *)output.ptr);
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_x0 = 0;
+    uint offset_x1 = 0;
+    uint offset_x2 = 0;
+    uint offset_x3 = 0;
+
+#if defined(DST_DEPTH)
+    offset_no_padding_nhwc_4D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3);
+#else  /* defined(DST_DEPTH) */
+    offset_no_padding_nhwc_3D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3);
+#endif /* defined(DST_DEPTH) */
+
+    uint8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
+    uint8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
+    uint8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
+    uint8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
+
+    uint8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1));
+    uint8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1));
+    uint8 index  = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max));
+    vstore8(index, 0, (__global uint *)indices.ptr);
+
+#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM */
+}
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_stride_w                      Stride of the indices tensor in W dimension (in bytes)
+ * @param[in]  indices_step_w                        indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nhwc_indices_fp16(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output),
+    TENSOR4D_DECLARATION(indices))
+{
+    // Get pixels pointer
+#if defined(DST_DEPTH)
+    Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+    Tensor4D output  = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH);
+#else  /* defined(DST_DEPTH) */
+    Tensor3D input        = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output       = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices      = CONVERT_TO_TENSOR3D_STRUCT(indices);
+#endif /* defined(DST_DEPTH) */
+
+#if defined(DST_DEPTH)
+    // Load data
+    half8 data_top0    = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 0, 0));
+    half8 data_top1    = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 0, 0));
+    half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 1, 0));
+    half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 1, 0));
+#else  /* defined(DST_DEPTH) */
+    // Load data
+    half8 data_top0    = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
+    half8 data_top1    = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
+    half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 1));
+    half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 1));
+#endif /* defined(DST_DEPTH) */
+
+    half8 data_top_max    = POOL_OP(data_top0, data_top1);
+    half8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1);
+    half8 data_max        = POOL_OP(data_top_max, data_bottom_max);
+    vstore8(data_max, 0, (__global half *)output.ptr);
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_x0_int = 0;
+    uint offset_x1_int = 0;
+    uint offset_x2_int = 0;
+    uint offset_x3_int = 0;
+
+#if defined(DST_DEPTH)
+    offset_no_padding_nhwc_4D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int);
+#else  /* defined(DST_DEPTH) */
+    offset_no_padding_nhwc_3D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int);
+#endif /* defined(DST_DEPTH) */
+
+    ushort offset_x0 = (ushort)offset_x0_int;
+    ushort offset_x1 = (ushort)offset_x1_int;
+    ushort offset_x2 = (ushort)offset_x2_int;
+    ushort offset_x3 = (ushort)offset_x3_int;
+
+    ushort8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
+    ushort8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
+    ushort8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
+    ushort8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
+
+    ushort8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1));
+    ushort8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1));
+    ushort8 index  = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max));
+    vstore8(CONVERT(index, uint8), 0, (__global uint *)indices.ptr);
+
+#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM */
+}
+\ No newline at end of file
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index bdc88a4f86..d60e196b7f 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -60,13 +60,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices, "Indices not supported in the CL backend.");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
                                     "Unsupported combination of parameters!");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
                                     && (input->data_layout() == DataLayout::NHWC),
                                     "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
+    // Check indices
+    if(indices)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+    }
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -80,7 +87,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     return Status{};
 }
 
-std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
+std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -140,7 +147,19 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
             AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
                                                pool_stride_x, pool_stride_y);
             AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, input_access, output_access);
+
+            // Update indices window
+            if(indices)
+            {
+                AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
+                window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
+                indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
+            }
+            else
+            {
+                window_changed = update_window_and_padding(win, input_access, output_access);
+            }
+
             output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
             break;
         }
@@ -153,7 +172,19 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
                                             0, -1,
                                             ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1));
             AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, input_access, output_access);
+
+            // Update indices window
+            if(indices)
+            {
+                AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
+                window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
+                indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
+            }
+            else
+            {
+                window_changed = update_window_and_padding(win, input_access, output_access);
+            }
+
             output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
             break;
         }
@@ -207,8 +238,39 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
 
     // Set build options
     CLBuildOptions build_opts;
+    const DataType data_type = input->info()->data_type();
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, (indices ? indices->info() : nullptr));
+
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+    ICLKernel::configure_internal(std::get<1>(win_config));
+
+    if(_data_layout == DataLayout::NCHW)
+    {
+        CLPoolingConfig pooling_config     = std::get<2>(win_config);
+        _num_elems_processed_per_iteration = pooling_config.first;
+        _border_size                       = pooling_config.second;
+    }
+    else
+    {
+        _border_size                       = BorderSize(1, 0, 0, 0);
+        _num_elems_processed_per_iteration = 8;
+    }
 
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    // Tensor paddings are used to calculate the indicies for MAX pooling
+    if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+    {
+        build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(input->info()->padding().left));
+        build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(input->info()->padding().right));
+        build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(input->info()->padding().top));
+        build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(input->info()->padding().bottom));
+        build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
+        build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+        build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+    }
+
+    if(is_data_type_quantized_asymmetric(data_type) && input->info()->quantization_info() != output->info()->quantization_info())
     {
         const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
         const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
@@ -223,8 +285,6 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
     auto_init(input->info(), output->info(), pool_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
 
-    const DataType data_type = input->info()->data_type();
-
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
     build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
@@ -282,6 +342,20 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
                                           + support::cpp11::to_string(pool_size_x);
                 _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
             }
+            else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+            {
+                // For max pooling with pool2x2, store indicies which will be used in max unpooling
+                if(data_type == DataType::F32)
+                {
+                    std::string kernel_name = "pooling_layer_2_nchw_indices_fp32";
+                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+                }
+                else if(data_type == DataType::F16)
+                {
+                    std::string kernel_name = "pooling_layer_2_nchw_indices_fp16";
+                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+                }
+            }
             else // Run general case
             {
                 std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
@@ -296,32 +370,33 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
             build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
             build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
                                      "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
-            std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
-            _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+            build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
+                                     "-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->tensor_shape().total_size_upper(3)));
+
+            if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+            {
+                if(data_type == DataType::F32)
+                {
+                    std::string kernel_name = "pooling_layer_2_nhwc_indices_fp32";
+                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+                }
+                else if(data_type == DataType::F16)
+                {
+                    std::string kernel_name = "pooling_layer_2_nhwc_indices_fp16";
+                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+                }
+            }
+            else
+            {
+                std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+            }
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Not implemented");
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info);
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-    ICLKernel::configure_internal(std::get<1>(win_config));
-
-    if(_data_layout == DataLayout::NCHW)
-    {
-        CLPoolingConfig pooling_config     = std::get<2>(win_config);
-        _num_elems_processed_per_iteration = pooling_config.first;
-        _border_size                       = pooling_config.second;
-    }
-    else
-    {
-        _border_size                       = BorderSize(1, 0, 0, 0);
-        _num_elems_processed_per_iteration = 8;
-    }
-
     // Set config_id for enabling LWS tuning
     _config_id = "pooling_layer_";
     _config_id += lower_string(string_from_data_type(data_type));
@@ -377,6 +452,10 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
                 unsigned int idx = 0;
                 add_3D_tensor_argument(idx, _input, in_slice);
                 add_3D_tensor_argument(idx, _output, slice);
+                if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+                {
+                    add_3D_tensor_argument(idx, _indices, slice);
+                }
                 enqueue(queue, *this, slice, lws_hint());
             }
             while(window_collapsed.slide_window_slice_3D(slice));
@@ -398,6 +477,10 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
                 unsigned int idx = 0;
                 add_4D_tensor_argument(idx, _input, in_slice);
                 add_4D_tensor_argument(idx, _output, slice);
+                if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+                {
+                    add_4D_tensor_argument(idx, _indices, slice);
+                }
                 enqueue(queue, *this, slice, lws_hint());
             }
             while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index b4ba38e9aa..eefad4ab2c 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -71,6 +71,12 @@ framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(5, 7) })),
 framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
 framework::dataset::make("ExcludePadding", { true }));
 
+const auto PoolingLayerDatasetFPIndicesSmall = combine(combine(combine(framework::dataset::make("PoolingType",
+{ PoolingType::MAX }),
+framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 0, 0) })),
+framework::dataset::make("ExcludePadding", { true, false }));
+
 constexpr AbsoluteTolerance<float>   tolerance_f32(0.001f);  /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
 constexpr AbsoluteTolerance<float>   tolerance_f16(0.01f);   /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);   /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */
@@ -134,6 +140,9 @@ using CLSpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<CLTens
 template <typename T>
 using CLMixedPrecesionPoolingLayerFixture = PoolingLayerValidationMixedPrecisionFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
 
+template <typename T>
+using CLPoolingLayerIndicesFixture = PoolingLayerIndicesValidationFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSpecial, CLSpecialPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::PoolingLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
@@ -157,6 +166,17 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<float>, framework::Datase
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPIndicesSmall,
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F32))),
+                                                                                                                        pool_data_layout_dataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target_indices), _ref_indices);
+}
+
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
@@ -176,6 +196,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLMixedPrecesionPoolingLayerFixture<half>, fram
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPIndicesSmall,
+                                                                                                                       framework::dataset::make("DataType",
+                                                                                                                               DataType::F16))),
+                                                                                                                       pool_data_layout_dataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target_indices), _ref_indices);
+}
 TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 35ce27c9a4..5f4edfe49c 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -215,7 +215,7 @@ SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, const PoolingLay
         return pooling_layer_internal<half, float>(src, info, indices, data_layout);
     }
 
-    return pooling_layer_internal<half>(src, info, indices);
+    return pooling_layer_internal<half>(src, info, indices, data_layout);
 }
 
 template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices, DataLayout data_layout);
author	Sheri Zhang <sheri.zhang@arm.com>	2020-08-03 20:11:56 +0100
committer	Sheri Zhang <sheri.zhang@arm.com>	2020-08-05 15:59:59 +0000
commit	801bbcb2f9a16633d0071b55aaf93b89870248ee (patch)
tree	4581fd059191e89803f3d77db0b052714bc20eba
parent	0499dff9293a86d3d53f72fed0a38b2823563674 (diff)
download	ComputeLibrary-801bbcb2f9a16633d0071b55aaf93b89870248ee.tar.gz