diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 4 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/pooling_layer.cl | 518 | ||||
-rw-r--r-- | src/core/CL/kernels/CLPoolingLayerKernel.cpp | 137 |
3 files changed, 593 insertions, 66 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 73874b69b2..9dec79b01a 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -325,6 +325,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "pooling_layer_7", "pooling_layer.cl" }, { "pooling_layer_MxN_nchw", "pooling_layer.cl" }, { "pooling_layer_MxN_nhwc", "pooling_layer.cl" }, + { "pooling_layer_2_nhwc_indices_fp32", "pooling_layer.cl" }, + { "pooling_layer_2_nhwc_indices_fp16", "pooling_layer.cl" }, + { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" }, + { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" }, { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" }, { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" }, { "prior_box_layer_nchw", "prior_box_layer.cl" }, diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index 2a0e040cf1..9e6521b300 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -192,22 +192,22 @@ ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ __kernel void pooling_layer_2( TENSOR3D_DECLARATION(input), @@ -256,22 +256,22 @@ __kernel void pooling_layer_2( * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ __kernel void pooling_layer_3( TENSOR3D_DECLARATION(input), @@ -344,22 +344,22 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ __kernel void pooling_layer_optimized_3( TENSOR3D_DECLARATION(input), @@ -402,22 +402,22 @@ __kernel void pooling_layer_optimized_3( * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ __kernel void pooling_layer_MxN_nchw( TENSOR3D_DECLARATION(input), @@ -515,17 +515,17 @@ ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_siz * -DPOOL_AVG must be provided otherwise max pooling will be performed. * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) @@ -534,7 +534,7 @@ ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_siz * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ __kernel void pooling_layer_MxN_nhwc( TENSOR4D_DECLARATION(input), @@ -572,7 +572,7 @@ __kernel void pooling_layer_MxN_nhwc( data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0)); #else /* defined(DST_DEPTH) */ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); + data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); #endif /* defined(DST_DEPTH) */ #if defined(POOL_L2) @@ -596,3 +596,443 @@ __kernel void pooling_layer_MxN_nhwc( // Store result vstore8(CONVERT(vdata, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)output.ptr); } + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + +inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom) +{ + const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; + const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM; + + const int x = get_global_id(0) * STRIDE_X; + const int y = get_global_id(1) * STRIDE_Y; + const int z = get_global_id(2); + + //x axis: width, y axis: height, z axis: component + const uint padded_offset = input->offset_first_element_in_bytes + + x * input->stride_x + + y * input->stride_y + + z * input->stride_z; + + const uint offset_base = padded_offset + - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ + - PAD_TENSOR_TOP * input->stride_y /* top padding */ + - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */ + - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); + +#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) + *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT)); +#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ + *offset_top = (uint)(offset_base / sizeof(DATA_TYPE)); +#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ + + *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; + + return; +} + +inline void offset_no_padding_nhwc_3D(const Tensor3D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3) +{ + const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; + + const int x = get_global_id(0); + const int y = get_global_id(1) * STRIDE_X; + const int z = get_global_id(2) * STRIDE_Y; + + //x axis: component, y axis: width, z axis: height + const uint padded_offset = input->offset_first_element_in_bytes + + x * 8 * input->stride_x + + y * input->stride_y + + z * input->stride_z; + + const uint offset_base = padded_offset + - (z + 1) * PAD_TENSOR_TOP * input->stride_y /* Top padding for each z plane */ + - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ + - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */ + - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); + + *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE); + *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; + *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE); + *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; + + return; +} + +#if defined(DST_DEPTH) +inline void offset_no_padding_nhwc_4D(const Tensor4D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3) +{ + const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; + const int z_max = get_global_size(2) / BATCH_SIZE; + + const int x = get_global_id(0); + const int y = get_global_id(1) * STRIDE_X; + const int z = (get_global_id(2) % z_max) * STRIDE_Y; + const int w = get_global_id(2) / z_max; + + const unsigned int padded_offset = input->offset_first_element_in_bytes + + x * 8 * input->stride_x + + y * input->stride_y + + z * input->stride_z; + + const unsigned int offset_base = padded_offset + - (z + 1) * PAD_TENSOR_TOP * input->stride_y /* Top padding for each z plane */ + - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ + - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */ + - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); + + *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE); + *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; + *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE); + *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; + + return; +} +#endif //defined(DST_DEPTH) + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + +/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nchw_indices_fp32( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + // Load data + float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0)); + float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0)); + + // Perform calculations + float data0_max = POOL_OP(data0.s0, data0.s1); + float data1_max = POOL_OP(data1.s0, data1.s1); + float res = POOL_OP(data0_max, data1_max); + // Store result + *(__global float *)output.ptr = res; + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_top = 0; + uint offset_bottom = 0; + + offset_no_padding_nchw(&input, &offset_top, &offset_bottom); + + uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); + uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); + uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); + + *(__global uint *)indices.ptr = index; + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +} + +/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nchw_indices_fp16( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + // Load data + half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0)); + half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0)); + + // Perform calculations + half data0_max = POOL_OP(data0.s0, data0.s1); + half data1_max = POOL_OP(data1.s0, data1.s1); + half res = POOL_OP(data0_max, data1_max); + // Store result + *(__global half *)output.ptr = res; + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_top = 0; + uint offset_bottom = 0; + + offset_no_padding_nchw(&input, &offset_top, &offset_bottom); + + uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); + uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); + uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); + + *(__global uint *)indices.ptr = index; + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +} + +/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_stride_w Stride of the indices tensor in W dimension (in bytes) + * @param[in] indices_step_w indices_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nhwc_indices_fp32( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + TENSOR4D_DECLARATION(indices)) +{ + // Get pixels pointer +#if defined(DST_DEPTH) + Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); + Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH); +#else /* defined(DST_DEPTH) */ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); +#endif /* defined(DST_DEPTH) */ + +#if defined(DST_DEPTH) + // Load data + float8 data_top0 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 0, 0)); + float8 data_top1 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 0, 0)); + float8 data_bottom0 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 1, 0)); + float8 data_bottom1 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 1, 0)); +#else /* defined(DST_DEPTH) */ + // Load data + float8 data_top0 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0)); + float8 data_top1 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0)); + float8 data_bottom0 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 1)); + float8 data_bottom1 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 1)); +#endif /* defined(DST_DEPTH) */ + + float8 data_top_max = POOL_OP(data_top0, data_top1); + float8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1); + float8 data_max = POOL_OP(data_top_max, data_bottom_max); + vstore8(data_max, 0, (__global float *)output.ptr); + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_x0 = 0; + uint offset_x1 = 0; + uint offset_x2 = 0; + uint offset_x3 = 0; + +#if defined(DST_DEPTH) + offset_no_padding_nhwc_4D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3); +#else /* defined(DST_DEPTH) */ + offset_no_padding_nhwc_3D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3); +#endif /* defined(DST_DEPTH) */ + + uint8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; + uint8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; + uint8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; + uint8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; + + uint8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1)); + uint8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1)); + uint8 index = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max)); + vstore8(index, 0, (__global uint *)indices.ptr); + +#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM */ +} + +/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_stride_w Stride of the indices tensor in W dimension (in bytes) + * @param[in] indices_step_w indices_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nhwc_indices_fp16( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + TENSOR4D_DECLARATION(indices)) +{ + // Get pixels pointer +#if defined(DST_DEPTH) + Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); + Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH); +#else /* defined(DST_DEPTH) */ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); +#endif /* defined(DST_DEPTH) */ + +#if defined(DST_DEPTH) + // Load data + half8 data_top0 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 0, 0)); + half8 data_top1 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 0, 0)); + half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 1, 0)); + half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 1, 0)); +#else /* defined(DST_DEPTH) */ + // Load data + half8 data_top0 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0)); + half8 data_top1 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0)); + half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 1)); + half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 1)); +#endif /* defined(DST_DEPTH) */ + + half8 data_top_max = POOL_OP(data_top0, data_top1); + half8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1); + half8 data_max = POOL_OP(data_top_max, data_bottom_max); + vstore8(data_max, 0, (__global half *)output.ptr); + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_x0_int = 0; + uint offset_x1_int = 0; + uint offset_x2_int = 0; + uint offset_x3_int = 0; + +#if defined(DST_DEPTH) + offset_no_padding_nhwc_4D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int); +#else /* defined(DST_DEPTH) */ + offset_no_padding_nhwc_3D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int); +#endif /* defined(DST_DEPTH) */ + + ushort offset_x0 = (ushort)offset_x0_int; + ushort offset_x1 = (ushort)offset_x1_int; + ushort offset_x2 = (ushort)offset_x2_int; + ushort offset_x3 = (ushort)offset_x3_int; + + ushort8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; + ushort8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; + ushort8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; + ushort8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; + + ushort8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1)); + ushort8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1)); + ushort8 index = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max)); + vstore8(CONVERT(index, uint8), 0, (__global uint *)indices.ptr); + +#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM */ +}
\ No newline at end of file diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index bdc88a4f86..d60e196b7f 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -60,13 +60,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices, "Indices not supported in the CL backend."); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2), "Unsupported combination of parameters!"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() && (input->data_layout() == DataLayout::NHWC), "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + // Check indices + if(indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + } // Checks performed when output is configured if(output->total_size() != 0) @@ -80,7 +87,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info) +std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -140,7 +147,19 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y, pool_stride_x, pool_stride_y); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access); + + // Update indices window + if(indices) + { + AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, output_access, indices_access); + indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); + } + else + { + window_changed = update_window_and_padding(win, input_access, output_access); + } + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); break; } @@ -153,7 +172,19 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso 0, -1, ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1)); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access); + + // Update indices window + if(indices) + { + AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, output_access, indices_access); + indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); + } + else + { + window_changed = update_window_and_padding(win, input_access, output_access); + } + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); break; } @@ -207,8 +238,39 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co // Set build options CLBuildOptions build_opts; + const DataType data_type = input->info()->data_type(); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, (indices ? indices->info() : nullptr)); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + ICLKernel::configure_internal(std::get<1>(win_config)); + + if(_data_layout == DataLayout::NCHW) + { + CLPoolingConfig pooling_config = std::get<2>(win_config); + _num_elems_processed_per_iteration = pooling_config.first; + _border_size = pooling_config.second; + } + else + { + _border_size = BorderSize(1, 0, 0, 0); + _num_elems_processed_per_iteration = 8; + } - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info()) + // Tensor paddings are used to calculate the indicies for MAX pooling + if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type)) + { + build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(input->info()->padding().left)); + build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(input->info()->padding().right)); + build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(input->info()->padding().top)); + build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(input->info()->padding().bottom)); + build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(input->info()->dimension(idx_channel))); + build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width))); + build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height))); + } + + if(is_data_type_quantized_asymmetric(data_type) && input->info()->quantization_info() != output->info()->quantization_info()) { const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); @@ -223,8 +285,6 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co auto_init(input->info(), output->info(), pool_info); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr)); - const DataType data_type = input->info()->data_type(); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); @@ -282,6 +342,20 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co + support::cpp11::to_string(pool_size_x); _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); } + else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type)) + { + // For max pooling with pool2x2, store indicies which will be used in max unpooling + if(data_type == DataType::F32) + { + std::string kernel_name = "pooling_layer_2_nchw_indices_fp32"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else if(data_type == DataType::F16) + { + std::string kernel_name = "pooling_layer_2_nchw_indices_fp16"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + } else // Run general case { std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw"; @@ -296,32 +370,33 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height))); build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1, "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height))); - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1, + "-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->tensor_shape().total_size_upper(3))); + + if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type)) + { + if(data_type == DataType::F32) + { + std::string kernel_name = "pooling_layer_2_nhwc_indices_fp32"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else if(data_type == DataType::F16) + { + std::string kernel_name = "pooling_layer_2_nhwc_indices_fp16"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + } + else + { + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } break; } default: ARM_COMPUTE_ERROR("Not implemented"); } - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - ICLKernel::configure_internal(std::get<1>(win_config)); - - if(_data_layout == DataLayout::NCHW) - { - CLPoolingConfig pooling_config = std::get<2>(win_config); - _num_elems_processed_per_iteration = pooling_config.first; - _border_size = pooling_config.second; - } - else - { - _border_size = BorderSize(1, 0, 0, 0); - _num_elems_processed_per_iteration = 8; - } - // Set config_id for enabling LWS tuning _config_id = "pooling_layer_"; _config_id += lower_string(string_from_data_type(data_type)); @@ -377,6 +452,10 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, slice); + if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + { + add_3D_tensor_argument(idx, _indices, slice); + } enqueue(queue, *this, slice, lws_hint()); } while(window_collapsed.slide_window_slice_3D(slice)); @@ -398,6 +477,10 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_4D_tensor_argument(idx, _input, in_slice); add_4D_tensor_argument(idx, _output, slice); + if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + { + add_4D_tensor_argument(idx, _indices, slice); + } enqueue(queue, *this, slice, lws_hint()); } while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); |