diff options
author | Anthony Barbier <anthony.barbier@arm.com> | 2017-11-28 10:31:43 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:41:58 +0000 |
commit | fcd52fbc578a2f5e6a1df4c823284621cc55645a (patch) | |
tree | b6e7430b2e69fa26fa2405723f827a7e7dc73447 /src/core/CL/kernels | |
parent | 666635c68ebbb182d1db4a85f33ed5325d472a65 (diff) | |
download | ComputeLibrary-fcd52fbc578a2f5e6a1df4c823284621cc55645a.tar.gz |
COMPMID-661: Vectorize im2col and add lws heuristics for convolution kernels #46
Change-Id: Idaab987384d6a12a114f609abd50446fd94536b2
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110879
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/CL/kernels')
-rw-r--r-- | src/core/CL/kernels/CLCol2ImKernel.cpp | 15 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 14 | ||||
-rw-r--r-- | src/core/CL/kernels/CLIm2ColKernel.cpp | 54 |
3 files changed, 80 insertions, 3 deletions
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp index 31cc6448c9..f2886c569a 100644 --- a/src/core/CL/kernels/CLCol2ImKernel.cpp +++ b/src/core/CL/kernels/CLCol2ImKernel.cpp @@ -72,6 +72,21 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts)); + // Configure the local work size for Bifrost with a value obtained + // via exhaustive autotuning over 30 representative tensor shapes. + const GPUTarget gpu_target = get_arch_from_target(get_target()); + if(gpu_target == GPUTarget::BIFROST) + { + if((_convolved_dims.first == 7) || (_convolved_dims.first == 14)) + { + _lws_hint = cl::NDRange(1, 7, 1); + } + else + { + _lws_hint = cl::NDRange(1, 8, 1); + } + } + // Configure window Window win = calculate_max_window(*input->info(), Steps()); diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index d39dcdb336..16706dd748 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -68,7 +68,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen GPUTarget arch_target = get_arch_from_target(get_target()); // Configure LWS hint - _lws_hint = (output->info()->dimension(1) == 196) ? cl::NDRange(1, 7) : cl::NDRange(8, 8); + if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24) + { + // LWS optimized for the 11x11 AlexNet convolution on Bifrost. + _lws_hint = cl::NDRange(2, 2); + } + else if(output->info()->dimension(1) == 196) + { + _lws_hint = cl::NDRange(1, 7); + } + else + { + _lws_hint = cl::NDRange(8, 8); + } // Create build options CLBuildOptions build_opts; diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index 07372c7b91..f7cf9a3cb4 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -53,7 +53,8 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const _input = input; _output = output; - const DataType data_type = input->info()->data_type(); + const DataType data_type = input->info()->data_type(); + const GPUTarget gpu_target = get_arch_from_target(get_target()); // Create kernel CLBuildOptions build_opts; @@ -98,6 +99,56 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const if(kernel_dims.width == 3 && kernel_dims.height == 3 && !conv_info.has_padding()) { kernel_name = "im2col_kernel3x3_padx0_pady0"; + + // Local work size optimized for the 3x3 MobileNets convolution on Bifrost. + if(gpu_target == GPUTarget::BIFROST && input->info()->dimension(0) == 224) + { + _lws_hint = cl::NDRange(2, 3, 3); + } + } + else if(kernel_dims.width > 1 && !conv_info.has_padding()) + { + kernel_name = "im2col_generic_padx0_pady0"; + + // Optimized im2col is performed using one or more vector operations with the specified vector size + // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4 + // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3. + // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3. + // Using the vector size of 8, however, may be faster. + size_t vector_size = 4; + // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0 + // is used instead.) + if(kernel_dims.width < vector_size) + { + vector_size = kernel_dims.width; + } + // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost. + if(gpu_target == GPUTarget::BIFROST && kernel_dims.width == 11) + { + _lws_hint = cl::NDRange(1, 1, 1); + vector_size = 8; + } + const size_t width_mod_vector_size = kernel_dims.width % vector_size; + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size)); + } + else + { + if(gpu_target == GPUTarget::BIFROST) + { + const size_t input_channels = input->info()->dimension(2); + if((input_channels & (input_channels - 1)) == 0) + { + // input_channels is a power of two + _lws_hint = cl::NDRange(1, 1, 4); + } + else if(input_channels < 192 && (input_channels % 4) == 0) + { + // input_channels is less than 192 and is a multiple of 4 + _lws_hint = cl::NDRange(1, 1, 2); + } + // otherwise the default is optimal + } } _run_func = &CLIm2ColKernel::run_generic; } @@ -173,7 +224,6 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice_in); add_2D_tensor_argument(idx, _output, slice_out); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->dimension(2))); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3])); enqueue(queue, *this, slice, _lws_hint); |