From 4a626a7d52e9c4759bdc16b65401a53779dd975f Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Wed, 4 Apr 2018 10:01:14 +0100 Subject: COMPMID-801: NHWC support in CLIm2Col. And extended tests coverage adding kernel shapes 3x1, 1x5 and 7x7 Change-Id: Ia7c1d4da2368d5f5fbc1a41187f4ac1aca5f150f Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127727 Tested-by: Jenkins Reviewed-by: Gian Marco Iodice --- src/core/CL/cl_kernels/im2col.cl | 202 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 201 insertions(+), 1 deletion(-) (limited to 'src/core/CL/cl_kernels') diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl index 1e85e1b303..f53ce21d05 100644 --- a/src/core/CL/cl_kernels/im2col.cl +++ b/src/core/CL/cl_kernels/im2col.cl @@ -123,7 +123,207 @@ __kernel void im2col1x1_stridex1_dchw( } #endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) +#define PTR_TO_VALUE(PTR, DATA_TYPE) *((DATA_TYPE *)(PTR)) + #if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) + +/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5 + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col_generic_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + const int src_stride_y_int = (int)src_stride_y; + const int src_stride_z_int = (int)src_stride_z; + const int xc = get_global_id(1); // x coordinate in the convolved tensor + const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor + const int ch = get_global_id(0); // input feature map + const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X - PAD_LEFT; + const int yi = yc * STRIDE_Y - PAD_TOP; + + // Calculate output indices + const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH; + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; + + for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) + { + const int y0 = yi + yk; + if(y0 >= 0 && y0 < SRC_HEIGHT) + { + int xk; + for(xk = 0; xk < KERNEL_WIDTH; xk++) + { + const int x0 = xi + xk; + if(x0 >= 0 && x0 < SRC_WIDTH) + { + *((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + xk * src_stride_y + yk * src_stride_z, DATA_TYPE); + } + else + { + *((__global DATA_TYPE *)output_ptr) = PAD_VALUE; + } + output_ptr += 1 * sizeof(DATA_TYPE); + } + } + else + { + for(int xk = 0; xk < KERNEL_WIDTH; xk++) + { + *((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE; + output_ptr += 1 * dst_stride_x; + } + } + } +#ifdef HAS_BIAS + if(ch == (KERNEL_DEPTH - 1)) + { + *((__global DATA_TYPE *)output_ptr) = 1.0f; + output_ptr += 1 * dst_stride_x; + } +#endif // HAS_BIAS +} + +/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3 + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col3x3_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + const int src_stride_y_int = (int)src_stride_y; + const int src_stride_z_int = (int)src_stride_z; + const int xc = get_global_id(1); // x coordinate in the convolved tensor + const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor + const int ch = get_global_id(0); // input feature map + const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X - PAD_LEFT; + const int yi = yc * STRIDE_Y - PAD_TOP; + + // Calculate output indices + const int xo = ch * 9; // 3x3 + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; + + VEC_DATA_TYPE(DATA_TYPE, 3) + row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); + VEC_DATA_TYPE(DATA_TYPE, 3) + row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); + VEC_DATA_TYPE(DATA_TYPE, 3) + row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); + + const int3 y = (int3)yi + (int3)(0, 1, 2); + // Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer. + if(y.s0 >= 0 && y.s0 < SRC_HEIGHT) + { + row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))( + PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE), + PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE), + PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE)); + } + + if(y.s1 >= 0 && y.s1 < SRC_HEIGHT) + { + row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))( + PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE), + PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE), + PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE)); + } + + if(y.s2 >= 0 && y.s2 < SRC_HEIGHT) + { + row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))( + PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE), + PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE), + PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE)); + } + +#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + // Put 0 if the value is out-of-bound + const int3 x = (int3)xi + (int3)(0, 1, 2); + VEC_DATA_TYPE(COND_DATA_TYPE, 3) + cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); + row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0); + row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0); + row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0); +#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr); + *((__global DATA_TYPE *)output_ptr + 8) = row2.s2; + +#ifdef HAS_BIAS + if(ch == (KERNEL_DEPTH - 1)) + { + *((__global DATA_TYPE *)output_ptr + 9) = 1.0f; + } +#endif // HAS_BIAS +} + /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3 * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float @@ -804,4 +1004,4 @@ __kernel void im2col_reduced_dchw( } #endif // HAS_BIAS } -#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) \ No newline at end of file +#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) -- cgit v1.2.1