diff options
Diffstat (limited to 'src/gpu/cl/kernels/ClDirectConv2dKernel.cpp')
-rw-r--r-- | src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 114 |
1 files changed, 40 insertions, 74 deletions
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index be4c8ef5b7..c4b70ca82b 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -23,11 +23,11 @@ */ #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h" -#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" @@ -40,6 +40,7 @@ #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" + namespace arm_compute { namespace opencl @@ -49,7 +50,7 @@ namespace kernels namespace { Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -83,6 +84,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } + if(data_layout == DataLayout::NHWC) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + "N0 can only be: 1, 2, 3, 4, 8, and 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + "K0 can only be: 1, 2, 3, 4, 8, and 16"); + if(desc.export_weights_to_cl_image) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + "K0 can only be: 4, 8, and 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_weights_to_cl_image(weights), + "Export to CLImage is not supported for this weight configuration"); + } + } + if(biases != nullptr) { if(is_data_type_quantized_asymmetric(src->data_type())) @@ -121,50 +137,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } return Status{}; } - -bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) -{ - if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) - { - return false; - } - - // If not floating point - if(!is_data_type_float(tensor->data_type())) - { - return false; - } - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) - { - return false; - } - - // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) - { - return false; - } - - const size_t image_w = tensor->tensor_shape()[0] / 4; - const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; - const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); - const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); - - if(image_w > max_image_w || image_h > max_image_h) - { - return false; - } - - return true; -} - } // namespace ClDirectConv2dKernel::ClDirectConv2dKernel() @@ -173,12 +145,12 @@ ClDirectConv2dKernel::ClDirectConv2dKernel() } void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc)); const int conv_stride_x = std::get<0>(conv_info.stride()); const int conv_stride_y = std::get<1>(conv_info.stride()); @@ -208,15 +180,12 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT Window win; if(_data_layout == DataLayout::NHWC) { - const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u); - unsigned int num_rows = 1U; - if(dst->tensor_shape()[0] > 16) - { - num_rows = src->data_type() == DataType::F32 ? 2U : 4U; - } + output_shape.collapse(2U, 1U); + const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]); + const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1]); // Create window and update padding - win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + win = calculate_max_window(output_shape, Steps(n0, m0)); } else if(_data_layout == DataLayout::NCHW) { @@ -233,16 +202,17 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT { kernel_name << "direct_convolution_nhwc"; - const unsigned int n0 = win.x().step(); - const unsigned int m0 = win.y().step(); - const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; - const unsigned int pad_left = conv_info.pad_left(); - const unsigned int pad_top = conv_info.pad_top(); - const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); + const unsigned int n0 = win.x().step(); + const unsigned int m0 = win.y().step(); + const unsigned int k0 = adjust_vec_size(desc.k0, src->dimension(channel_idx)); + const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; + const unsigned int pad_left = conv_info.pad_left(); + const unsigned int pad_top = conv_info.pad_top(); + + _export_to_cl_image = desc.export_weights_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(export_to_cl_image) + if(_export_to_cl_image) { gemm::update_padding_for_cl_image(weights); } @@ -274,7 +244,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type)); - build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); @@ -325,6 +295,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT } else { + _export_to_cl_image = false; + kernel_name << "direct_convolution_nchw"; build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); @@ -393,9 +365,9 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT } Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc)); return Status{}; } @@ -416,13 +388,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl { cl::Image2D weights_cl_image; - const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); - const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout); - - slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step())); - slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1)); - - if(export_to_cl_image) + if(_export_to_cl_image) { const size_t image_w = weights->info()->dimension(0) / 4; const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); @@ -436,7 +402,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, dst); - if(export_to_cl_image) + if(_export_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } |