From fcd52fbc578a2f5e6a1df4c823284621cc55645a Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Tue, 28 Nov 2017 10:31:43 +0000 Subject: COMPMID-661: Vectorize im2col and add lws heuristics for convolution kernels #46 Change-Id: Idaab987384d6a12a114f609abd50446fd94536b2 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110879 Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Anthony Barbier --- src/core/CL/CLKernelLibrary.cpp | 1 + src/core/CL/cl_kernels/convolution_layer.cl | 103 ++++++++++++++++++--- src/core/CL/kernels/CLCol2ImKernel.cpp | 15 +++ src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 14 ++- src/core/CL/kernels/CLIm2ColKernel.cpp | 54 ++++++++++- 5 files changed, 172 insertions(+), 15 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index a4b88b8eb2..de75518a05 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -251,6 +251,7 @@ const std::map CLKernelLibrary::_kernel_program_map = { "hog_orientation_binning", "hog.cl" }, { "hysteresis", "canny.cl" }, { "im2col_generic", "convolution_layer.cl" }, + { "im2col_generic_padx0_pady0", "convolution_layer.cl" }, { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cl" }, { "im2col_reduced", "convolution_layer.cl" }, { "init_level", "optical_flow_pyramid_lk.cl" }, diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl index c7e3e644f4..ce0849bf7a 100644 --- a/src/core/CL/cl_kernels/convolution_layer.cl +++ b/src/core/CL/cl_kernels/convolution_layer.cl @@ -117,27 +117,25 @@ __kernel void reshape_to_columns( * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] filter_depth The depth of the used filter * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ __kernel void im2col_generic( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), - uint filter_depth, uint src_stride_w, uint dst_stride_w) { const int xc = get_global_id(0); // x coordinate in the convolved tensor const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % filter_depth; // input feature map - const int batch = get_global_id(2) / filter_depth; // the batch + const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map + const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size - // Calculate input indeces + // Calculate input indices const int xi = xc * STRIDE_X - PAD_LEFT; const int yi = yc * STRIDE_Y - PAD_TOP; - // Calculate output indeces + // Calculate output indices const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution @@ -199,27 +197,25 @@ __kernel void im2col_generic( * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] filter_depth The depth of the used filter * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ __kernel void im2col_kernel3x3_padx0_pady0( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), - uint filter_depth, uint src_stride_w, uint dst_stride_w) { const int xc = get_global_id(0); // x coordinate in the convolved tensor const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % filter_depth; // input feature map - const int batch = get_global_id(2) / filter_depth; // the batch + const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map + const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size - // Calculate input indeces + // Calculate input indices const int xi = xc * STRIDE_X; const int yi = yc * STRIDE_Y; - // Calculate output indeces + // Calculate output indices const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution @@ -336,3 +332,86 @@ __kernel void im2col_reduced( } #endif // HAS_BIAS } + +#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) +/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when + * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0. + * + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. + * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4. + * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3. + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col_generic_padx0_pady0( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map + const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X; + const int yi = yc * STRIDE_Y; + // Calculate output indices + const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; + // Linearize convolution elements + for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y) + { + int last_x = 0; + for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE) + { + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); + VSTORE(VECTOR_SIZE) + (row, 0, output_ptr); + last_x = x; + } + // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE). + // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit. +#if WIDTH_MOD_VECTOR_SIZE == 1 + *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y)); +#elif WIDTH_MOD_VECTOR_SIZE > 1 + VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE) + row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y)); + VSTORE(WIDTH_MOD_VECTOR_SIZE) + (row, 0, output_ptr); +#endif /* WIDTH_MOD_VECTOR_SIZE */ + output_ptr += WIDTH_MOD_VECTOR_SIZE; + } /* End of loop over KERNEL_HEIGHT */ + +#ifdef HAS_BIAS + if(ch == (KERNEL_DEPTH - 1)) + { +#ifdef FIXED_POINT_POSITION + *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION); +#else // FIXED_POINT_POSITION + *output_ptr = 1.0f; +#endif // FIXED_POINT_POSITION + } +#endif // HAS_BIAS +} +#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp index 31cc6448c9..f2886c569a 100644 --- a/src/core/CL/kernels/CLCol2ImKernel.cpp +++ b/src/core/CL/kernels/CLCol2ImKernel.cpp @@ -72,6 +72,21 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p _kernel = static_cast(CLKernelLibrary::get().create_kernel("col2im", build_opts)); + // Configure the local work size for Bifrost with a value obtained + // via exhaustive autotuning over 30 representative tensor shapes. + const GPUTarget gpu_target = get_arch_from_target(get_target()); + if(gpu_target == GPUTarget::BIFROST) + { + if((_convolved_dims.first == 7) || (_convolved_dims.first == 14)) + { + _lws_hint = cl::NDRange(1, 7, 1); + } + else + { + _lws_hint = cl::NDRange(1, 8, 1); + } + } + // Configure window Window win = calculate_max_window(*input->info(), Steps()); diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index d39dcdb336..16706dd748 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -68,7 +68,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen GPUTarget arch_target = get_arch_from_target(get_target()); // Configure LWS hint - _lws_hint = (output->info()->dimension(1) == 196) ? cl::NDRange(1, 7) : cl::NDRange(8, 8); + if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24) + { + // LWS optimized for the 11x11 AlexNet convolution on Bifrost. + _lws_hint = cl::NDRange(2, 2); + } + else if(output->info()->dimension(1) == 196) + { + _lws_hint = cl::NDRange(1, 7); + } + else + { + _lws_hint = cl::NDRange(8, 8); + } // Create build options CLBuildOptions build_opts; diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index 07372c7b91..f7cf9a3cb4 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -53,7 +53,8 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const _input = input; _output = output; - const DataType data_type = input->info()->data_type(); + const DataType data_type = input->info()->data_type(); + const GPUTarget gpu_target = get_arch_from_target(get_target()); // Create kernel CLBuildOptions build_opts; @@ -98,6 +99,56 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const if(kernel_dims.width == 3 && kernel_dims.height == 3 && !conv_info.has_padding()) { kernel_name = "im2col_kernel3x3_padx0_pady0"; + + // Local work size optimized for the 3x3 MobileNets convolution on Bifrost. + if(gpu_target == GPUTarget::BIFROST && input->info()->dimension(0) == 224) + { + _lws_hint = cl::NDRange(2, 3, 3); + } + } + else if(kernel_dims.width > 1 && !conv_info.has_padding()) + { + kernel_name = "im2col_generic_padx0_pady0"; + + // Optimized im2col is performed using one or more vector operations with the specified vector size + // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4 + // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3. + // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3. + // Using the vector size of 8, however, may be faster. + size_t vector_size = 4; + // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0 + // is used instead.) + if(kernel_dims.width < vector_size) + { + vector_size = kernel_dims.width; + } + // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost. + if(gpu_target == GPUTarget::BIFROST && kernel_dims.width == 11) + { + _lws_hint = cl::NDRange(1, 1, 1); + vector_size = 8; + } + const size_t width_mod_vector_size = kernel_dims.width % vector_size; + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size)); + } + else + { + if(gpu_target == GPUTarget::BIFROST) + { + const size_t input_channels = input->info()->dimension(2); + if((input_channels & (input_channels - 1)) == 0) + { + // input_channels is a power of two + _lws_hint = cl::NDRange(1, 1, 4); + } + else if(input_channels < 192 && (input_channels % 4) == 0) + { + // input_channels is less than 192 and is a multiple of 4 + _lws_hint = cl::NDRange(1, 1, 2); + } + // otherwise the default is optimal + } } _run_func = &CLIm2ColKernel::run_generic; } @@ -173,7 +224,6 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice_in); add_2D_tensor_argument(idx, _output, slice_out); - _kernel.setArg(idx++, static_cast(_input->info()->dimension(2))); _kernel.setArg(idx++, static_cast(_input->info()->strides_in_bytes()[3])); _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[3])); enqueue(queue, *this, slice, _lws_hint); -- cgit v1.2.1