From ff1fe3e32e25069fed750cdfe3046b7d8d5a2628 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Sat, 2 Jan 2021 09:58:51 +0000 Subject: Remove padding from direct convolution - OpenCL - Refactor direct convolution for NHWC - Remove old kernels for NHWC - Change the heuristic in CLConvolutionLayer.cpp. The new direct convolution implementation is faster than FFT Resolves COMPMID-3908 Change-Id: Iee15ce7b04e21847b6eaae5c6d3c1b18180e7efc Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4876 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../CL/kernels/CLDirectConvolutionLayerKernel.cpp | 467 +++++++++------------ 1 file changed, 209 insertions(+), 258 deletions(-) (limited to 'src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp') diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp index 8884521794..91ff35b58d 100644 --- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -53,8 +53,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), "Weights feature map dimension should match the respective input's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); @@ -63,6 +61,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, && std::get<0>(conv_info.stride()) > 2, "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); + if(data_layout == DataLayout::NCHW) + { + if(is_data_type_quantized(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, + "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantised data types"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, + "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); + } + } + if(biases != nullptr) { if(is_data_type_quantized_asymmetric(input->data_type())) @@ -102,8 +114,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, return Status{}; } -inline bool can_run_optimized_kernel_for_bifrost(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, - DataType data_type, DataLayout data_layout) +inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, + DataType data_type, DataLayout data_layout) { return gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, @@ -115,29 +127,16 @@ inline bool can_run_optimized_kernel_for_bifrost(GPUTarget gpu_target, unsigned && (data_layout == DataLayout::NCHW); } -inline bool can_run_optimized_kernel_for_bifrost_nhwc(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, - DataType data_type, DataLayout data_layout) -{ - return gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT) - && (kernel_size == 9) - && (conv_stride_x == 1) && (conv_stride_y == 1) - && (data_type == DataType::F32) - && (data_layout == DataLayout::NHWC); -} - -inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, - unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, - unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input) +inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, + unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, + unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input) { const DataType data_type = input->data_type(); const DataLayout data_layout = input->data_layout(); unsigned int conv_stride_x = std::get<0>(conv_info.stride()); unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); + const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); if(run_optimized_bifrost) { @@ -174,7 +173,7 @@ inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsign } } } - else if(data_layout == DataLayout::NCHW) + else { num_elems_read_per_iteration_y = kernel_size; num_elems_written_per_iteration_x = 8; @@ -253,97 +252,13 @@ inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsign ARM_COMPUTE_ERROR("Invalid direct convolution size"); } } - else // data_layout == NHWC - { - const bool run_optimized_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); - - num_elems_written_per_iteration_x = 1; - - if(run_optimized_bifrost_nhwc) - { - num_elems_read_per_iteration_x = 4; - } - else - { - num_elems_read_per_iteration_x = 1; - } - - switch(kernel_size) - { - case 1: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_y = 8; - num_elems_written_per_iteration_y = 8; - break; - case 2: - num_elems_read_per_iteration_y = 16; - num_elems_written_per_iteration_y = 8; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 3: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_y = 10; - num_elems_written_per_iteration_y = 8; - break; - case 2: - num_elems_read_per_iteration_y = 17; - num_elems_written_per_iteration_y = 8; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 5: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_y = 12; - num_elems_written_per_iteration_y = 8; - break; - case 2: - num_elems_read_per_iteration_y = 20; - num_elems_written_per_iteration_y = 8; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 9: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_y = 16; - num_elems_written_per_iteration_y = 8; - break; - case 2: - num_elems_read_per_iteration_y = 24; - num_elems_written_per_iteration_y = 8; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - default: - ARM_COMPUTE_ERROR("Not implemented."); - break; - } - } } std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target) { - const DataLayout data_layout = input->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int kernel_size = weights->dimension(width_idx); + const DataLayout data_layout = input->data_layout(); - // Get convolved dimensions + // Get output shape TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info); // Output auto inizialitation if not yet initialized @@ -352,38 +267,39 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen input->data_type(), input->quantization_info()); - unsigned int num_elems_read_per_iteration_x = 0; - unsigned int num_elems_read_per_iteration_y = 0; - unsigned int num_elems_written_per_iteration_x = 0; - unsigned int num_elems_written_per_iteration_y = 0; - - unsigned int conv_pad_left = conv_info.pad_left(); - unsigned int conv_pad_top = conv_info.pad_top(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - setup_num_elems(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, - num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, - kernel_size, conv_info, target, input); - - // Create window and update padding - bool window_changed = false; - Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); - if(data_layout == DataLayout::NHWC) { - AccessWindowStatic input_access(input, 0, -conv_pad_left, - ceil_to_multiple(input->dimension(0), num_elems_read_per_iteration_x), - ceil_to_multiple(input->dimension(1) + conv_info.pad_right(), num_elems_read_per_iteration_y)); - AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_read_per_iteration_x), weights->dimension(1)); - AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + const unsigned int vec_size = std::min(static_cast(output->tensor_shape()[0]), 4u); + + // Create window and update padding + Window win = calculate_max_window(*output, Steps(vec_size, 1U)); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + Status err = Status{}; return std::make_pair(err, win); } else if(data_layout == DataLayout::NCHW) { + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int kernel_size = weights->dimension(width_idx); + + unsigned int num_elems_read_per_iteration_x = 0; + unsigned int num_elems_read_per_iteration_y = 0; + unsigned int num_elems_written_per_iteration_x = 0; + unsigned int num_elems_written_per_iteration_y = 0; + + unsigned int conv_pad_left = conv_info.pad_left(); + unsigned int conv_pad_top = conv_info.pad_top(); + unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + + setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, + num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, + kernel_size, conv_info, target, input); + + // Create window and update padding + bool window_changed = false; + Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); + AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); @@ -419,25 +335,7 @@ void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_c { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - _data_layout = input->info()->data_layout(); - const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - - const unsigned int kernel_size = weights->info()->dimension(width_idx); - const DataType data_type = input->info()->data_type(); - - // Get convolved dimensions - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), - output_shape, - 1, - input->info()->data_type(), - input->info()->quantization_info()); - - // Perform validation step + // Perform validation ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, @@ -446,72 +344,64 @@ void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_c _conv_stride_x = std::get<0>(conv_info.stride()); _conv_stride_y = std::get<1>(conv_info.stride()); - - if(_data_layout == DataLayout::NHWC) - { - _border_size = BorderSize(conv_info.pad_left(), 0, conv_info.pad_right(), 0); - } - else if(_data_layout == DataLayout::NCHW) - { - _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - } - else - { - ARM_COMPUTE_ERROR("Not supported"); - } - - _input = input; - _weights = weights; - _output = output; - _biases = biases; + _data_layout = input->info()->data_layout(); + _input = input; + _weights = weights; + _output = output; + _biases = biases; + + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const unsigned int kernel_size = weights->info()->dimension(width_idx); + const DataType data_type = input->info()->data_type(); const GPUTarget gpu_target = get_target(); - std::stringstream kernel_name; - kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; - if(_data_layout == DataLayout::NHWC) - { - kernel_name << "_" << lower_string(string_from_data_layout(_data_layout)); - } - - CLBuildOptions build_options; - build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS")); - - const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout); + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); - if(run_optimized_for_bifrost) - { - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx)))); + std::stringstream kernel_name; + CLBuildOptions build_options; - kernel_name << "_f32_bifrost"; - _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); - } - else + if(_data_layout == DataLayout::NHWC) { - build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx)))); - build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x))); - if(_data_layout == DataLayout::NHWC) - { - const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout); - build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1")); - build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx)))); - build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx)))); - build_options.add_option(std::string("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(height_idx)))); - build_options.add_option(std::string("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(width_idx)))); - build_options.add_option(std::string("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()))); - build_options.add_option(std::string("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()))); - build_options.add_option(std::string("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()))); - build_options.add_option(std::string("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y))); - if(run_optimized_for_bifrost_nhwc) - { - const unsigned int num_elems_read_per_iteration_x = 4; - _border_size.right = num_elems_read_per_iteration_x; - build_options.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_read_per_iteration_x)); - } - } - build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); + _border_size = BorderSize(); + + kernel_name << "direct_convolution_nhwc"; + + const unsigned int n0 = win_config.second.x().step(); + const unsigned int m0 = win_config.second.y().step(); + const unsigned int k0 = std::min(static_cast(_input->info()->dimension(channel_idx)), 16u); + const unsigned int partial_store_n0 = _output->info()->dimension(channel_idx) % n0; + const unsigned int partial_store_m0 = _output->info()->dimension(channel_idx) % m0; + const unsigned int pad_left = conv_info.pad_left(); + const unsigned int pad_top = conv_info.pad_top(); + + build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS")); + build_options.add_option_if(_biases != nullptr, std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(_biases->info()->data_type()))); + build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(width_idx))); + build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(height_idx))); + build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(_input->info()->dimension(channel_idx))); + build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type())); + build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx))); + build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx))); + build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->dimension(channel_idx))); + build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(_output->info()->data_type())); + build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(width_idx))); + build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(height_idx))); + build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type())); + build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)); + build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y)); + build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left)); + build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top)); + build_options.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); + build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); if(is_data_type_quantized(data_type)) { @@ -523,33 +413,74 @@ void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_c int output_multiplier = 0; int output_shift = 0; quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); - - // Create kernel - _kernel = create_kernel(compile_context, "direct_convolution_quantized", build_options.options()); - - // Set static kernel arguments - unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1; - _kernel.setArg(idx++, -iqinfo.offset); - _kernel.setArg(idx++, -wqinfo.offset); - _kernel.setArg(idx++, oqinfo.offset); + build_options.add_option("-DIS_QUANTISED"); + build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); } else { - // Create kernel - _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); } } + else + { + _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; + + build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS")); + + const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout); + + if(run_optimized_for_bifrost) + { + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx)))); + + kernel_name << "_f32_bifrost"; + } + else + { + build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); + build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx)))); + build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x))); + build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); + + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); + build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + + kernel_name.str("direct_convolution_quantized"); + } + } + } + + _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); // Set config_id for enabling LWS tuning - _config_id = "direct_convolution_"; + _config_id = kernel_name.str(); + _config_id += "_"; _config_id += lower_string(string_from_data_type(data_type)); _config_id += "_"; _config_id += support::cpp11::to_string(kernel_size); @@ -588,38 +519,58 @@ void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); // Get initial windows - Window slice = window.first_slice_window_3D(); - Window win_in = window; + Window slice = window.first_slice_window_3D(); - win_in.adjust(Window::DimX, -_border_size.left, true); - win_in.adjust(Window::DimY, -_border_size.top, true); + if(_data_layout == DataLayout::NHWC) + { + slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1) * _output->info()->dimension(2), 1)); + slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(3), 1)); - const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + add_3D_tensor_argument(idx, _weights, slice); + if(_biases != nullptr) + { + add_1D_tensor_argument(idx, _biases, slice); + } + _kernel.setArg(idx++, static_cast(_weights->info()->strides_in_bytes()[3])); + enqueue(queue, *this, slice, lws_hint()); + } + else + { + Window win_in = window; - win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x); - win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y); + win_in.adjust(Window::DimX, -_border_size.left, true); + win_in.adjust(Window::DimY, -_border_size.top, true); - Window slice_in = win_in.first_slice_window_3D(); - unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); - add_3D_tensor_argument(idx1, _weights, slice); + const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - if(_biases != nullptr) - { - Window slice_biases; - slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape()); - add_1D_tensor_argument(idx1, _biases, slice_biases); - } + win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x); + win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y); - _kernel.setArg(idx1++, static_cast(_weights->info()->strides_in_bytes()[3])); + Window slice_in = win_in.first_slice_window_3D(); + unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); + add_3D_tensor_argument(idx1, _weights, slice); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); + if(_biases != nullptr) + { + Window slice_biases; + slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape()); + add_1D_tensor_argument(idx1, _biases, slice_biases); + } + + _kernel.setArg(idx1++, static_cast(_weights->info()->strides_in_bytes()[3])); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); } - while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); } } // namespace arm_compute -- cgit v1.2.1