diff options
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/convolution_layer.cl | 74 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/flatten.cl | 57 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/im2col.cl | 529 |
3 files changed, 328 insertions, 332 deletions
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl index 9335b047fe..2b75b45fe1 100644 --- a/src/core/CL/cl_kernels/convolution_layer.cl +++ b/src/core/CL/cl_kernels/convolution_layer.cl @@ -53,7 +53,7 @@ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) */ -__kernel void reshape_to_columns_nchw( +__kernel void reshape_to_columns( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), #ifdef HAS_BIAS @@ -109,74 +109,4 @@ __kernel void reshape_to_columns_nchw( } } } - -/** This kernel reshapes the tensor's low three dimensions to single column - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] bias_ptr Pointer to the bias tensor. Same as @p src_ptr - * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes) - * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] depth The depth of the input tensor - * @param[in] width The width of the input tensor - * @param[in] height The height of the input tensor - * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix - */ -__kernel void reshape_to_columns_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), -#ifdef HAS_BIAS - VECTOR_DECLARATION(bias), -#endif /* HAS_BIAS */ - uint depth, uint width, uint height, uint total_filters, uint dst_stride_z) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1)); - - __global uchar *tmp_src_ptr = src.ptr; - __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * width * dst_stride_y + get_global_id( - 0) * width * height * dst_stride_y; -#ifdef HAS_BIAS - __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes; -#endif /* HAS_BIAS */ - - if(is_last_thread) - { - for(uint i = 0; i < total_filters; ++i) - { - *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr); - -#ifdef HAS_BIAS - *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr)); - tmp_bias_ptr += bias_stride_x; -#endif /* HAS_BIAS */ - tmp_src_ptr += height * src_stride_z; - tmp_dst_ptr += dst_stride_x; - } - } - else - { - for(uint i = 0; i < total_filters; ++i) - { - *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr); - tmp_src_ptr += height * src_stride_z; - tmp_dst_ptr += dst_stride_x; - } - } -} -#endif // defined(DATA_TYPE) && defined(NUM_GROUPS)
\ No newline at end of file +#endif // defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/flatten.cl b/src/core/CL/cl_kernels/flatten.cl new file mode 100644 index 0000000000..df0f9c4886 --- /dev/null +++ b/src/core/CL/cl_kernels/flatten.cl @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) + +/** This opencl kernel flattens the first 3 dimensions of the input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void flatten( + TENSOR3D_DECLARATION(src), + VECTOR_DECLARATION(dst)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * (int)SRC_WIDTH + get_global_id(2) * (int)(SRC_WIDTH * SRC_HEIGHT)) * sizeof( + DATA_TYPE); + + *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)src.ptr); +} +#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl index d034b30b68..274ec20046 100644 --- a/src/core/CL/cl_kernels/im2col.cl +++ b/src/core/CL/cl_kernels/im2col.cl @@ -35,13 +35,12 @@ #error "Element size not support" #endif // ELEMENT_SIZE -#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 1x1 and the stride_x = 1 +#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH) +/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW * - * @note This kernel computes 4 elements * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. * @@ -62,16 +61,16 @@ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col1x1_stridex1_dchw( +__kernel void im2col1x1_stridex1_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor - const uint yc = get_global_id(1); // y coordinate in the convolved tensor - const uint ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const uint batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor + const uint yc = get_global_id(1); // y coordinate in the convolved tensor + const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map + const uint batch = get_global_id(2) / SRC_DEPTH; // batch size // Clamp xc // The strategy clamps at "xc" as it will be a valid value for sure @@ -107,7 +106,7 @@ __kernel void im2col1x1_stridex1_dchw( *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3; #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f; *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f; @@ -116,18 +115,16 @@ __kernel void im2col1x1_stridex1_dchw( } #endif // HAS_BIAS } -#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) +#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH) -#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR)) - -#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) - -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5 +#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) +#if defined(DILATION_X) && defined(DILATION_Y) +/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 @@ -151,183 +148,65 @@ __kernel void im2col1x1_stridex1_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col_generic_nhwc( +__kernel void im2col_generic_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int src_stride_y_int = (int)src_stride_y; - const int src_stride_z_int = (int)src_stride_z; - const int xc = get_global_id(1); // x coordinate in the convolved tensor - const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor - const int ch = get_global_id(0); // input feature map - const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X - PAD_LEFT; const int yi = yc * STRIDE_Y - PAD_TOP; // Calculate output indices - const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH; + const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; + // Linearize convolution elements for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) { - const int dilated_offset_y = yk * DILATION_Y; - const int y0 = yi + dilated_offset_y; - if(y0 >= 0 && y0 < SRC_HEIGHT) + int y = yi + yk * DILATION_Y; + for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr) { - int xk; - for(xk = 0; xk < KERNEL_WIDTH; xk++) + int x = xi + xk * DILATION_X; +#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); +#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) { - const int dilated_offset_x = xk * DILATION_X; - const int x0 = xi + dilated_offset_x; - if(x0 >= 0 && x0 < SRC_WIDTH) - { - *((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + dilated_offset_x * src_stride_y + dilated_offset_y * src_stride_z, DATA_TYPE); - } - else - { - *((__global DATA_TYPE *)output_ptr) = PAD_VALUE; - } - output_ptr += 1 * sizeof(DATA_TYPE); + *output_ptr = PAD_VALUE; } - } - else - { - for(int xk = 0; xk < KERNEL_WIDTH; xk++) + else { - *((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE; - output_ptr += 1 * dst_stride_x; + *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); } +#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 } } -#ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) - { - *((__global DATA_TYPE *)output_ptr) = 1.0f; - output_ptr += 1 * dst_stride_x; - } -#endif // HAS_BIAS -} - -/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3 - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 - * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 - * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). - */ -__kernel void im2col3x3_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - const int src_stride_y_int = (int)src_stride_y; - const int src_stride_z_int = (int)src_stride_z; - const int xc = get_global_id(1); // x coordinate in the convolved tensor - const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor - const int ch = get_global_id(0); // input feature map - const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size - - // Calculate input indices - const int xi = xc * STRIDE_X - PAD_LEFT; - const int yi = yc * STRIDE_Y - PAD_TOP; - - // Calculate output indices - const int xo = ch * 9; // 3x3 - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; - - VEC_DATA_TYPE(DATA_TYPE, 3) - row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); - VEC_DATA_TYPE(DATA_TYPE, 3) - row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); - VEC_DATA_TYPE(DATA_TYPE, 3) - row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); - - const int3 y = (int3)yi + (int3)(0, 1, 2); - // Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer. - if(y.s0 >= 0 && y.s0 < SRC_HEIGHT) - { - row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))( - PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE)); - } - - if(y.s1 >= 0 && y.s1 < SRC_HEIGHT) - { - row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))( - PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE)); - } - - if(y.s2 >= 0 && y.s2 < SRC_HEIGHT) - { - row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))( - PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE)); - } - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - // Put 0 if the value is out-of-bound - const int3 x = (int3)xi + (int3)(0, 1, 2); - VEC_DATA_TYPE(COND_DATA_TYPE, 3) - cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); - row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0); - row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0); - row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr); - *((__global DATA_TYPE *)output_ptr + 8) = row2.s2; #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { - *((__global DATA_TYPE *)output_ptr + 9) = 1.0f; + *output_ptr = 1.0f; } #endif // HAS_BIAS } +#endif // defined(DILATION_X) && defined(DILATION_Y) -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3 +/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 @@ -350,16 +229,16 @@ __kernel void im2col3x3_nhwc( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col3x3_dchw( +__kernel void im2col3x3_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X - PAD_LEFT; @@ -402,19 +281,19 @@ __kernel void im2col3x3_dchw( *((__global DATA_TYPE *)output_ptr + 8) = row2.s2; #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *((__global DATA_TYPE *)output_ptr + 9) = 1.0f; } #endif // HAS_BIAS } -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5 +/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 @@ -437,16 +316,16 @@ __kernel void im2col3x3_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col5x5_dchw( +__kernel void im2col5x5_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X - PAD_LEFT; @@ -576,20 +455,20 @@ __kernel void im2col5x5_dchw( } #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *((__global DATA_TYPE *)output_ptr) = 1.0f; } #endif // HAS_BIAS } -#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) +#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) -#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 11x11 +#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) +/** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. * @@ -610,16 +489,16 @@ __kernel void im2col5x5_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col11x11_padx0_pady0_dchw( +__kernel void im2col11x11_padx0_pady0_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X; @@ -776,21 +655,21 @@ __kernel void im2col11x11_padx0_pady0_dchw( } #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *((__global DATA_TYPE *)output_ptr) = 1.0f; } #endif // HAS_BIAS } -#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) +#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) -#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) -/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when - * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0. +#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) +/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4. * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3. + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 @@ -810,16 +689,16 @@ __kernel void im2col11x11_padx0_pady0_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col_generic_padx0_pady0_dchw( +__kernel void im2col_generic_padx0_pady0_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X; @@ -855,25 +734,25 @@ __kernel void im2col_generic_padx0_pady0_dchw( } /* End of loop over KERNEL_HEIGHT */ #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *output_ptr = 1.0f; } #endif // HAS_BIAS } -#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) +#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) + +#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED) -#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. +#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + +/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC * + * @note This kernel computes VECTOR_SIZE elements * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DKERNEL_DEPTH=64 - * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 - * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 + * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 @@ -893,64 +772,154 @@ __kernel void im2col_generic_padx0_pady0_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col_generic_dchw( +__kernel void im2col3x3_nhwc( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size // Calculate input indices - const int xi = xc * STRIDE_X - PAD_LEFT; - const int yi = yc * STRIDE_Y - PAD_TOP; + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; - // Calculate output indices - const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; - __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; - - // Linearize convolution elements - for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) - { - int y = yi + yk * DILATION_Y; - for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr) - { - int x = xi + xk * DILATION_X; -#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); -#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) - { - *output_ptr = PAD_VALUE; - } - else - { - *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); - } -#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - } - } + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int yi_coord = 0; + int3 offset = 0; + + // Clamp xi + int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT); +#if PAD_TOP != 0 || PAD_BOTTOM != 0 +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + xi_offset *= (int3)src_stride_y; + + // Out-of-bound condition for X + int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH); + + // yi == 0 + // Clamp yi + // yi_coord is casted to unsigned int in order to use just a min() operation + // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 + yi_coord = yi - (int)PAD_TOP; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT)); + values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0)); + values1 = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1)); + values2 = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2)); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // yi == 1 + // Clamp yi_coord (it can be negative if PAD_TOP > 1) + yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with zeros + y_cond = (int)((uint)(yi - (int)PAD_TOP + 1) >= (uint)(SRC_HEIGHT)); + values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0)); + values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1)); + values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2)); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // yi == 2 + // Clamp yi_coord + yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + y_cond = (int)((uint)(yi - (int)PAD_TOP + 2) >= (uint)(SRC_HEIGHT)); + values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0)); + values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1)); + values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2)); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // Store + VSTORE(VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + 0 * SRC_DEPTH); + VSTORE(VECTOR_SIZE) + (values1, 0, (__global DATA_TYPE *)(output_ptr) + 1 * SRC_DEPTH); + VSTORE(VECTOR_SIZE) + (values2, 0, (__global DATA_TYPE *)(output_ptr) + 2 * SRC_DEPTH); + VSTORE(VECTOR_SIZE) + (values3, 0, (__global DATA_TYPE *)(output_ptr) + 3 * SRC_DEPTH); + VSTORE(VECTOR_SIZE) + (values4, 0, (__global DATA_TYPE *)(output_ptr) + 4 * SRC_DEPTH); + VSTORE(VECTOR_SIZE) + (values5, 0, (__global DATA_TYPE *)(output_ptr) + 5 * SRC_DEPTH); + VSTORE(VECTOR_SIZE) + (values6, 0, (__global DATA_TYPE *)(output_ptr) + 6 * SRC_DEPTH); + VSTORE(VECTOR_SIZE) + (values7, 0, (__global DATA_TYPE *)(output_ptr) + 7 * SRC_DEPTH); + VSTORE(VECTOR_SIZE) + (values8, 0, (__global DATA_TYPE *)(output_ptr) + 8 * SRC_DEPTH); #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if((ch + VECTOR_SIZE) >= SRC_DEPTH) { - *output_ptr = 1.0f; + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f; } #endif // HAS_BIAS } -#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) -/**This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when - * the kernel width and height are the same of width and height of the input tensor +/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -958,35 +927,75 @@ __kernel void im2col_generic_dchw( * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] width The width of the input tensor - * @param[in] height The height of the input tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col_reduced_dchw( +__kernel void im2col_generic_nhwc( TENSOR3D_DECLARATION(src), - VECTOR_DECLARATION(dst), - uint width, uint height) + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) { - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size - const uint image_size = width * height; + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int i = 0; + for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) + { + // Clamp yi_coord + int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP; + yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1)); - __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x; + // Out-of-bound condition for Y + int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT); - *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr); + for(int xk = 0; xk < KERNEL_WIDTH; ++xk) + { + // Clamp xi_coord + int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT); + xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1)); + + // Out-of-bound condition for X + int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); + + int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z); + + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset)); + + // Replace with PAD_VALUE if the value is out-of-bound + values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))x_border_condition || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(y_border_condition)); + + // Store + VSTORE(VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); + + i++; + } + } #ifdef HAS_BIAS - // If it is the last thread in the 3 dimensional workgroup - if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1)) + if((ch + VECTOR_SIZE) >= SRC_DEPTH) { - tmp_out_ptr += dst_stride_x; - *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1.0f; + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f; } #endif // HAS_BIAS } +#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED) #endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) |