author    Giorgio Arena <giorgio.arena@arm.com>    2021-09-01 14:05:00 +0100
committer Giorgio Arena <giorgio.arena@arm.com>    2021-09-03 14:04:19 +0000
commit    8fce496a715929372b3c448a233713d87d65f768 (patch)
tree      283841880dd0c969addda1c08f50fc6e622ff07d /src/core/CL/cl_kernels/nchw/pooling_layer.cl
parent    b8025b3bb1b75fa94400a665e65a1d53ba9965f9 (diff)
download  ComputeLibrary-8fce496a715929372b3c448a233713d87d65f768.tar.gz
Remove padding from ClPool2dKernel NCHW
- Simplify NCHW kernel structure by removing old optimized paths
- Merge quantized with fp kernels

Resolve COMPMID-4722

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: I79016b119619aed6a6193295601cd6517f14b88c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6183
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
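For context on the merged quantized path: the patch below requantizes the accumulated pooling result from the input quantization parameters (OFFSET_IN1/SCALE_IN1) to the output ones (OFFSET_OUT/SCALE_OUT) before storing. A minimal C sketch of that arithmetic follows; the function name and signature are illustrative only, not part of the library.

```c
#include <math.h>
#include <stdint.h>

/* Illustrative sketch (not library API) of the requantization done in the
 * merged quantized path: dequantize with the input params, requantize with
 * the output params, round to nearest even, then saturate to QASYMM8. */
static uint8_t requantize_pool_result(float acc, float offset_in, float scale_in,
                                      float offset_out, float scale_out)
{
    const float dequantized = (acc - offset_in) * scale_in;         /* back to real values */
    const float requantized = dequantized / scale_out + offset_out; /* into output quantization */
    const float rounded     = nearbyintf(requantized);              /* rte, like convert_int_rte */

    /* Saturate to the QASYMM8 range, like CONVERT_SAT(..., DATA_TYPE). */
    if (rounded < 0.f)   return 0;
    if (rounded > 255.f) return 255;
    return (uint8_t)rounded;
}
```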
Diffstat (limited to 'src/core/CL/cl_kernels/nchw/pooling_layer.cl')
-rw-r--r--  src/core/CL/cl_kernels/nchw/pooling_layer.cl | 328
1 file changed, 141 insertions(+), 187 deletions(-)
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
index 790ddb381a..15ad116289 100644
--- a/src/core/CL/cl_kernels/nchw/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
@@ -22,13 +22,15 @@
* SOFTWARE.
*/
#include "helpers.h"
-#include "repeat.h"
-#include "tile_helpers.h"
#if defined(POOL_AVG) || defined(POOL_L2)
#define POOL_OP(x, y) ((x) + (y))
#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#if defined(QUANTIZED)
+#define POOL_OP(x, y) (max((x), (y)))
+#else // defined(QUANTIZED)
#define POOL_OP(x, y) (fmax((x), (y)))
+#endif // defined(QUANTIZED)
#endif /* defined(POOL_AVG) || defined(POOL_L2) */
#if defined(POOL_L2)
@@ -40,13 +42,12 @@
#define DIV_OP(x, y) (x * (1.f / y))
#define SQRT_OP(x) sqrt((x))
-#if defined(FP_MIXED_PRECISION)
+#if defined(FP_MIXED_PRECISION) || defined(QUANTIZED)
#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
- CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
-#else /* defined(FP_MIXED_PRECISION) */
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) || defined(QUANTIZED)*/
#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
-#endif /* defined(FP_MIXED_PRECISION) */
+#endif /* defined(FP_MIXED_PRECISION) || defined(QUANTIZED)*/
ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
@@ -66,7 +67,7 @@ ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y,
/** Performs a pooling function of pool size equal to N (NCHW)
*
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32/QASYMM8;
* @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
* @note In case of average pooling the following information must be passed at compile time:
* -DPOOL_AVG must be provided otherwise max pooling will be performed.
@@ -75,59 +76,93 @@ ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y,
* -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
* @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32/QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void pooling_layer_MxN_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ int id0 = get_global_id(0);
+ int id1 = get_global_id(1);
+ int id2 = get_global_id(2);
+
+ int x_coords = (id0 * STRIDE_X) - PAD_X;
+ int y_coords = (id1 * STRIDE_Y) - PAD_Y;
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + y_coords * (int)src_stride_y + id2 * src_stride_z;
VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
vdata = INITIAL_VALUE;
ACC_DATA_TYPE sdata = INITIAL_VALUE;
+ const int end_x = min((int)POOL_SIZE_X, (int)(SRC_WIDTH - x_coords));
+ const int end_y = min((int)POOL_SIZE_Y, (int)(SRC_HEIGHT - y_coords));
+
// Load data
- for(int y = 0; y < POOL_SIZE_Y; y++)
+ for(int y = 0; y < end_y; ++y)
{
- int x = 0;
- for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
+ if((y_coords + y) >= 0)
{
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+ int x = 0;
+ for(; x <= (end_x - 8); x += 8)
+ {
+ int8 src_x = (int8)(x_coords + x) + VEC_OFFS(int, 8);
+#if defined(POOL_AVG) || defined(POOL_L2)
+ SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ cond_x = CONVERT(src_x < 0, SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, 8));
+ src_x = clamp(src_x, (int8)0, (int8)(SRC_WIDTH - 1));
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ data0 = select(VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y)), (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))0, REVERSE(cond_x, 8));
+#else // defined(POOL_AVG) || defined(POOL_L2)
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y));
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
#endif /* defined(POOL_L2) */
- vdata = POOL_OP(vdata, data0);
- }
- // Leftover
- for(; x < (int)POOL_SIZE_X; ++x)
- {
- ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < end_x; ++x)
+ {
+ int src_x = x_coords + x;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ SELECT_DATA_TYPE(ACC_DATA_TYPE)
+ cond_x = (src_x < 0);
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ ACC_DATA_TYPE data0 = select((ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x * sizeof(DATA_TYPE) + y * src_stride_y))), (ACC_DATA_TYPE)0, cond_x);
+#else // defined(POOL_AVG) || defined(POOL_L2)
+ src_x = clamp(src_x, 0, SRC_WIDTH - 1);
+ ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x * sizeof(DATA_TYPE) + y * src_stride_y)));
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
#endif /* defined(POOL_L2) */
- sdata = POOL_OP(sdata, data0);
+
+ sdata = POOL_OP(sdata, data0);
+ }
}
}
@@ -144,76 +179,61 @@ __kernel void pooling_layer_MxN_nchw(
res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+#if defined(QUANTIZED)
+
+ DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+ const float result_f32 = convert_float(result_q8);
+ const float input_offset = (float)OFFSET_IN1;
+ const float input_scale = (float)SCALE_IN1;
+ const float scale_out = (float)SCALE_OUT;
+ const float offset_out = (float)OFFSET_OUT;
+ const float in_f32 = (result_f32 - input_offset) * input_scale;
+ const float out_f32 = in_f32 / scale_out + offset_out;
+ result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = result_q8;
+
+#else // defined(QUANTIZED)
+
#if defined(POOL_L2)
// Take square root of the result in L2 pooling
res = SQRT_OP(res);
#endif /* defined(POOL_L2) */
// Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = (DATA_TYPE)res;
+#endif // defined(QUANTIZED)
}
#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
-{
- const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
- const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
-
- const int x = get_global_id(0) * STRIDE_X;
- const int y = get_global_id(1) * STRIDE_Y;
- const int z = get_global_id(2);
-
- //x axis: width, y axis: height, z axis: component
- const uint padded_offset = input->offset_first_element_in_bytes
- + x * input->stride_x
- + y * input->stride_y
- + z * input->stride_z;
-
- const uint offset_base = padded_offset
- - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */
- - PAD_TENSOR_TOP * input->stride_y /* top padding */
- - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
- - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
-
-#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
- *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
-#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
- *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
-#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
-
- *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
-
- return;
-}
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
*
* @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
* @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
* @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
* @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
* @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
* @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
@@ -223,109 +243,43 @@ inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint
* @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
*/
-__kernel void pooling_layer_2_nchw_indices_fp32(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
+__kernel void pooling_layer_2_nchw_indices(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
TENSOR3D_DECLARATION(indices))
{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
- // Load data
- float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
- float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
-
- // Perform calculations
- float data0_max = POOL_OP(data0.s0, data0.s1);
- float data1_max = POOL_OP(data1.s0, data1.s1);
- float res = POOL_OP(data0_max, data1_max);
- // Store result
- *(__global float *)output.ptr = res;
+ int id0 = get_global_id(0);
+ int id1 = get_global_id(1);
+ int id2 = get_global_id(2);
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+ int2 x_coords = clamp((int2)((id0 * STRIDE_X) - PAD_X), (int2)0, (int2)(SRC_WIDTH - 1));
+ int2 y_coords = clamp((int2)((id1 * STRIDE_Y) - PAD_Y) + VEC_OFFS(int, 2), (int2)0, (int2)(SRC_HEIGHT - 1));
- uint offset_top = 0;
- uint offset_bottom = 0;
-
- offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
- uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
- uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
- uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
- *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
-
-/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp16(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- TENSOR3D_DECLARATION(indices))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + id2 * src_stride_z;
// Load data
- half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
- half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s0 * sizeof(DATA_TYPE) + y_coords.s0 * (int)src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s1 * sizeof(DATA_TYPE) + y_coords.s1 * (int)src_stride_y));
// Perform calculations
- half data0_max = POOL_OP(data0.s0, data0.s1);
- half data1_max = POOL_OP(data1.s0, data1.s1);
- half res = POOL_OP(data0_max, data1_max);
+ DATA_TYPE data0_max = POOL_OP(data0.s0, data0.s1);
+ DATA_TYPE data1_max = POOL_OP(data1.s0, data1.s1);
+ DATA_TYPE res = POOL_OP(data0_max, data1_max);
// Store result
- *(__global half *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+ *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = res;
- uint offset_top = 0;
- uint offset_bottom = 0;
+#if defined(SRC_BATCH)
- offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+ uint offset_top = (x_coords.s0 + y_coords.s0 * SRC_WIDTH + id2 * (SRC_WIDTH * SRC_HEIGHT)) % SRC_BATCH;
+ uint offset_bottom = offset_top + SRC_WIDTH;
uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
- *(__global uint *)indices.ptr = index;
+ *(__global uint *)(indices_ptr + indices_offset_first_element_in_bytes + id0 * sizeof(uint) + id1 * indices_stride_y + id2 * indices_stride_z) = index;
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+#endif // defined(SRC_BATCH)
}
\ No newline at end of file
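A side note on the rewritten pooling_layer_2_nchw_indices kernel: the removed offset_no_padding_nchw() helper is replaced by a direct flat NCHW index wrapped per batch via SRC_BATCH (the number of elements in one batch). A small C sketch of that index arithmetic, with hypothetical names chosen for illustration:

```c
#include <stdint.h>

/* Hypothetical illustration of the index arithmetic in the new kernel:
 * offset_top is the flat NCHW index of the top-left element of the 2x2
 * window, wrapped per batch; offset_bottom is the element directly below. */
typedef struct { uint32_t top; uint32_t bottom; } pool2_offsets_t;

static pool2_offsets_t pool2_window_offsets(int x, int y, int channel,
                                            int src_width, int src_height,
                                            uint32_t src_batch /* elements per batch */)
{
    pool2_offsets_t o;
    o.top    = (uint32_t)(x + y * src_width + channel * src_width * src_height) % src_batch;
    o.bottom = o.top + (uint32_t)src_width; /* next row, same column */
    return o;
}
```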