From 54f18c4a7a20ff697dc1ba66a73e9d622a407d02 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Wed, 7 Feb 2018 23:13:06 +0000 Subject: COMPMID-901 - Optimizing CLCol2ImKernel This patch makes col2im on OpenCL 2 times faster Change-Id: I8d90f5a72a050355ca1fd13433d8c2c26e5e33f5 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/119442 Tested-by: Jenkins Reviewed-by: Georgios Pinitas --- src/core/CL/cl_kernels/col2im.cl | 84 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 4 deletions(-) (limited to 'src/core/CL/cl_kernels/col2im.cl') diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl index 58fb80a416..9b5a7b5b7e 100644 --- a/src/core/CL/cl_kernels/col2im.cl +++ b/src/core/CL/cl_kernels/col2im.cl @@ -27,13 +27,88 @@ #include "fixed_point.h" #endif // FIXED_POINT_POSITION -#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) -/** This kernel performs a reshaping of the output of the convolution layer. +#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) +#if !defined(FIXED_POINT_POSITION) + +#if ELEMENT_SIZE == 1 +#define COND_DATA_TYPE char +#elif ELEMENT_SIZE == 2 +#define COND_DATA_TYPE short +#elif ELEMENT_SIZE == 4 +#define COND_DATA_TYPE int +#else // ELEMENT_SIZE +#error "Element size not support" +#endif // ELEMENT_SIZE + +/** This kernel performs a reshaping of the output of the convolution layer * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320 + * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600 + * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void col2im( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint dst_stride_w) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + + VEC_DATA_TYPE(DATA_TYPE, 8) + data = vload8(0, (__global DATA_TYPE *)src.ptr); + + uint x = get_global_id(0) * 8; + uint8 x_clamped = x + (uint8)(0, 1, 2, 3, 4, 5, 6, 7); + + VEC_DATA_TYPE(COND_DATA_TYPE, 8) + cond0 = CONVERT((x_clamped < WIDTH_INPUT), VEC_DATA_TYPE(COND_DATA_TYPE, 8)); + + // Clamp x if out-of-bounds + x_clamped = select((uint8)x, x_clamped, convert_int8(cond0)); + + // If out-of-bound, overwrite with the first element + data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0); + + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes; + + // Compute output offset + int idx = (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w; + + // Store value + *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s0 * dst_stride_z)) = data.s0; + *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s1 * dst_stride_z)) = data.s1; + *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s2 * dst_stride_z)) = data.s2; + *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s3 * dst_stride_z)) = data.s3; + *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s4 * dst_stride_z)) = data.s4; + *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s5 * dst_stride_z)) = data.s5; + *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6; + *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7; +} +#else // !defined(FIXED_POINT_POSITION) +/** This kernel performs a reshaping of the output of the convolution layer. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=qs8 * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=320 * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32 + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -65,4 +140,5 @@ __kernel void col2im( // Store value *((__global DATA_TYPE *)(dst.ptr + idx)) = *((__global DATA_TYPE *)(src.ptr)); } -#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) \ No newline at end of file +#endif // !defined(FIXED_POINT_POSITION) +#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) \ No newline at end of file -- cgit v1.2.1