From ad9a7ed2f9969381af0b9c97438a3402e16d9483 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Fri, 16 Sep 2022 14:14:21 +0100
Subject: Rework DepthwiseConvolution heuristic on OpenCL

Resolves COMPMID-5632

Change-Id: I2bdbe69a610ca2510fbd74d5d412842679299762
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8365
Benchmark: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Viet-Hoa Do
Reviewed-by: Jakub Sujak
Comments-Addressed: Arm Jenkins
---
 .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 75 +++++++++++++++-------
 1 file changed, 51 insertions(+), 24 deletions(-)

(limited to 'src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp')

diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 277cba47a6..cded31936c 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -59,7 +59,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_weights_to_cl_image(weights) == false), "Export to cl_image not supported!");
+    ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!");
     ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
@@ -161,7 +162,8 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
       _depth_multiplier(1),
       _output_multipliers(nullptr),
       _output_shifts(nullptr),
-      _export_to_cl_image(false),
+      _export_input_to_cl_image(false),
+      _export_weights_to_cl_image(false),
       _is_quantized(false)
 {
     _type = CLKernelType::DEPTHWISE;
@@ -192,15 +194,16 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info);
     auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
 
-    _input              = input;
-    _output             = output;
-    _weights            = weights;
-    _biases             = biases;
-    _depth_multiplier   = conv_info.depth_multiplier;
-    _output_multipliers = output_multipliers;
-    _output_shifts      = output_shifts;
-    _export_to_cl_image = dwc_info.export_weights_to_cl_image;
-    _is_quantized       = is_data_type_quantized(input->info()->data_type());
+    _input                      = input;
+    _output                     = output;
+    _weights                    = weights;
+    _biases                     = biases;
+    _depth_multiplier           = conv_info.depth_multiplier;
+    _output_multipliers         = output_multipliers;
+    _output_shifts              = output_shifts;
+    _export_input_to_cl_image   = dwc_info.export_input_to_cl_image;
+    _export_weights_to_cl_image = dwc_info.export_weights_to_cl_image;
+    _is_quantized               = is_data_type_quantized(input->info()->data_type());
 
     const unsigned int n0 = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
     const unsigned int m0 = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
@@ -208,8 +211,13 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
 
     CLBuildOptions build_opts;
 
-    // Update the padding for the weights tensor if we can export to cl_image
-    if(_export_to_cl_image)
+    // Update the padding for the input/weights tensor if we can export to cl_image
+    if(_export_input_to_cl_image)
+    {
+        arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
+    }
+
+    if(_export_weights_to_cl_image)
     {
         arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
     }
@@ -234,14 +242,18 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
 
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
     build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(conv_info.depth_multiplier));
-    build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+    build_opts.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
     // Note: SRC_DATA_TYPE must have the same data type of WEI_DATA_TYPE. In quantized, we could
     // have a case where the data types for the activation and weights are different. However, since the implementation
     // only works when both have same data type, we have to change the offset to take into account this aspect
     build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
     build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
     build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
-    build_opts.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+    build_opts.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(1)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(2)));
+    build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(1)));
+    build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(2)));
     build_opts.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(1)));
     build_opts.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(2)));
     build_opts.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
@@ -353,24 +365,39 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
 
     Window slice = window_collapsed.first_slice_window_4D();
 
+    cl::Image2D input_cl_image;
     cl::Image2D weights_cl_image;
 
-    if(_export_to_cl_image)
+    if(_export_input_to_cl_image || _export_weights_to_cl_image)
     {
-        const size_t      image_w         = _weights->info()->dimension(0) / 4;
-        const size_t      image_h         = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
-        const TensorShape shape2d(image_w, image_h);
-        const size_t      image_row_pitch = _weights->info()->strides_in_bytes()[1];
-        // Export cl_buffer to cl_image
-        weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch);
+        if(_export_input_to_cl_image)
+        {
+            const size_t      image_w         = _input->info()->dimension(0) / 4;
+            const size_t      image_h         = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+            const TensorShape shape2d(image_w, image_h);
+            const size_t      image_row_pitch = _input->info()->strides_in_bytes()[1];
+            input_cl_image                    = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch);
+        }
+
+        if(_export_weights_to_cl_image)
+        {
+            const size_t      image_w         = _weights->info()->dimension(0) / 4;
+            const size_t      image_h         = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+            const TensorShape shape2d(image_w, image_h);
+            const size_t      image_row_pitch = _weights->info()->strides_in_bytes()[1];
+            weights_cl_image                  = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch);
+        }
     }
 
     unsigned int idx = 0;
+    if(_export_input_to_cl_image)
+    {
+        _kernel.setArg(idx++, input_cl_image);
+    }
     add_4d_tensor_nhwc_argument(idx, _input);
     add_4d_tensor_nhwc_argument(idx, _output);
-
-    if(_export_to_cl_image)
+    if(_export_weights_to_cl_image)
     {
         _kernel.setArg(idx++, weights_cl_image);
     }
--
cgit v1.2.1
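
Background note (not part of the patch): the create_image2d_from_buffer() calls in the run() hunk above rely on OpenCL's "image from buffer" facility, which lets the kernel read the same cl_buffer through the GPU's texture path without any copy. The sketch below is a minimal, hypothetical illustration of that mechanism using the plain OpenCL C API; it assumes an F32 tensor, a device exposing cl_khr_image2d_from_buffer (or OpenCL 2.0), and placeholder names (existing_buffer, image_w, image_h, image_row_pitch) chosen to mirror the variables in the patch. It is not how Compute Library implements the helper, only an outline of the underlying API.

// Hypothetical helper, not part of Compute Library: create a 2D image that
// aliases an existing OpenCL buffer. No data is copied; the image is a
// texture view over the buffer's storage.
#include <CL/cl.h>
#include <cstddef>

cl_mem make_image2d_view_of_buffer(cl_context context,
                                   cl_mem     existing_buffer, // placeholder: the cl_buffer already bound to the kernel
                                   size_t     image_w,         // e.g. channels / 4, mirroring dimension(0) / 4 above
                                   size_t     image_h,         // e.g. the remaining dimensions collapsed into one
                                   size_t     image_row_pitch) // in bytes; must respect CL_DEVICE_IMAGE_PITCH_ALIGNMENT
{
    // Four 32-bit float channels per texel: this RGBA packing is why the image
    // width is dimension(0) / 4 and why n0 must be a multiple of 4 when the
    // weights are exported to cl_image (see the validate_arguments hunk above).
    cl_image_format format         = {};
    format.image_channel_order     = CL_RGBA;
    format.image_channel_data_type = CL_FLOAT;

    cl_image_desc desc   = {};
    desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width     = image_w;
    desc.image_height    = image_h;
    desc.image_row_pitch = image_row_pitch;
    desc.buffer          = existing_buffer; // the image reads straight from this buffer

    cl_int err   = CL_SUCCESS;
    cl_mem image = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
    return (err == CL_SUCCESS) ? image : nullptr;
}

A caller would normally check device capabilities first (for example CL_DEVICE_IMAGE_PITCH_ALIGNMENT and the maximum image dimensions via clGetDeviceInfo()), which is the kind of guard the export_to_cl_image(weights) validation in the first hunk is there to provide.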