From 9fe414430c3c989b1cdc79d41e031495aed2cb7c Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Wed, 23 Aug 2017 16:36:24 +0100 Subject: COMPMID-452 CL Generic Depthwise Convolution implementation. Change-Id: I115e48fe6ce5e281f3791aa5d80fdc754cdd2b5e Reviewed-on: http://mpd-gerrit.cambridge.arm.com/85082 Tested-by: Kaizen Reviewed-by: Gian Marco Iodice --- src/core/CL/cl_kernels/depthwise_convolution.cl | 134 +++++++++++++++++++++++- 1 file changed, 133 insertions(+), 1 deletion(-) (limited to 'src/core/CL/cl_kernels/depthwise_convolution.cl') diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl index cbcdbf2a34..9c2c3a5b37 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution.cl @@ -24,6 +24,8 @@ #include "helpers.h" +#if defined(CONV_STRIDE_X) + #if CONV_STRIDE_X == 1 #define convolution1x3 convolution1x3_stride_1 #elif CONV_STRIDE_X == 2 @@ -186,4 +188,134 @@ __kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECL weights_values2.s0, weights_values2.s1, weights_values2.s2); vstore2(pixels, 0, (__global float *)dst.ptr); -} \ No newline at end of file +} + +#endif //defined(CONV_STRIDE_X) + +#if defined(SRC_WIDTH) && defined(DATA_TYPE) +/** This kernel reshapes each of the tensor's low three dimensions to single rows. + * + * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void depthwise_weights_reshape(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + + __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y; + + for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr) + { + *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr; + } +} +#endif //defined(SRC_WIDTH) && defined(DATA_TYPE) + +#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && 
defined(DATA_TYPE) +/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_X, -DPAD_Y, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ + +__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + const int src_pixel_linear = get_global_id(1) * STRIDE_X; + const int full_length = SRC_WIDTH + 2 * PAD_X; + const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1); + + const int src_x = -PAD_X + src_pixel_linear % max_initial_x; + const int src_y = -PAD_Y + src_pixel_linear / max_initial_x * STRIDE_Y; + const int src_z = get_global_id(2); + + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z; + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr)); + + for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y) + { + for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr) + { + if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) + { + *output_ptr = 0; + } + else + { + *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); + } + } + } +} + +#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) + +#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && 
defined(DATA_TYPE) + +/** 
This kernel performs a reshaping of the output of the depthwise generic convolution. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g. -DCONV_WIDTH=32, -DCONV_HEIGHT=42 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void depthwise_vector_to_tensor( + VECTOR_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Vector src = CONVERT_TO_VECTOR_STRUCT(src); + + const int patch_size = CONV_WIDTH * CONV_HEIGHT; + const int id0 = get_global_id(0); + const int z = id0 / patch_size; + const int index2D = id0 - z * patch_size; + + __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z; + *((__global DATA_TYPE 
*)out_ptr) = *((__global DATA_TYPE *)src.ptr); +} + +#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE) -- cgit v1.2.1