From 7333e1f10f5da9dc67b511d326121a843771a107 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 8 Oct 2020 10:25:49 +0100
Subject: COMPMID-3732: Remove OpenCL padding from CLPoolingLayer

- Refactor pooling layer kernels on OpenCL (F32/F16/QASYMM8) to avoid
  padding and improve performance
- Add test for checking zero padding requirement
- Fix issue with extracting the index. The issue was caused by the
  padding passed at compile time
- auto_init indices tensor in CLPoolingLayerKernel

Change-Id: I1ae5a2ef8c4ce787c80dcd73e35c17bb34623cb5
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4188
Reviewed-by: Michele Di Giorgio
Reviewed-by: Giorgio Arena
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
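Note: the refactored NHWC path avoids OpenCL padding by picking the vector
width at configure time with adjust_vec_size(4, output->dimension(0)) and
passing the remainder of the innermost dimension to the kernel as
-DVEC_SIZE_LEFTOVER. The self-contained C++ sketch below illustrates the
idea only; adjust_vec_size_sketch and the tail-rewind loop are illustrative
stand-ins, not the library's actual helper or kernel code.

    // Sketch: pick a vector width that fits the innermost dimension, then
    // cover the row with full vectors plus one rewound tail vector, so no
    // out-of-bounds (padded) elements are ever touched.
    #include <cstddef>
    #include <iostream>

    // Halve the requested width until it fits dim0 (assumed behaviour,
    // loosely mirroring the intent of the library helper).
    unsigned int adjust_vec_size_sketch(unsigned int vec_size, std::size_t dim0)
    {
        while(vec_size > dim0 && vec_size > 1)
        {
            vec_size /= 2;
        }
        return vec_size;
    }

    int main()
    {
        const std::size_t  dim0     = 10;                              // e.g. channels of an NHWC tensor
        const unsigned int vec_size = adjust_vec_size_sketch(4, dim0); // requested width 4, as in the patch
        const unsigned int leftover = dim0 % vec_size;                 // value passed as -DVEC_SIZE_LEFTOVER

        for(std::size_t x = 0; x + vec_size <= dim0; x += vec_size)
        {
            std::cout << "full vector at x=" << x << "\n";
        }
        if(leftover != 0)
        {
            // The tail is handled by a vector shifted back into bounds,
            // overlapping the previous one instead of reading padding.
            std::cout << "tail vector rewound to x=" << dim0 - vec_size << "\n";
        }
        return 0;
    }

With dim0 = 10 this prints full vectors at x = 0 and x = 4, then a tail
vector at x = 6, so the whole row is covered without any padded reads.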
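Note: the index-extraction fix mentioned in the subject works because, once
the source tensor carries no implicit padding, the flat index of the max
element follows from logical coordinates alone and no longer depends on a
pad offset baked in at compile time. The sketch below shows the coordinate
arithmetic for one 2x2 window; the linearisation and the toy values are
assumptions for illustration, not the kernel's exact formula.

    // Sketch: argmax over a 2x2 pooling window on an unpadded buffer.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t W = 4, H = 4, C = 1; // toy single-batch, single-channel tensor
        std::vector<float> src(W * H * C);
        for(std::size_t i = 0; i < src.size(); ++i)
        {
            src[i] = static_cast<float>(i % 7); // arbitrary values
        }

        const std::size_t x0 = 2, y0 = 2; // top-left corner of the 2x2 window
        std::size_t max_idx = (y0 * W + x0) * C;
        float       max_val = src[max_idx];
        for(std::size_t dy = 0; dy < 2; ++dy)
        {
            for(std::size_t dx = 0; dx < 2; ++dx)
            {
                // Flat index from logical coordinates only: no pad term anywhere.
                const std::size_t idx = ((y0 + dy) * W + (x0 + dx)) * C;
                if(src[idx] > max_val)
                {
                    max_val = src[idx];
                    max_idx = idx;
                }
            }
        }
        std::cout << "max=" << max_val << " at flat index " << max_idx << "\n";
        return 0;
    }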
 src/core/CL/kernels/CLPoolingLayerKernel.cpp | 134 ++++++++++++++-------------
 1 file changed, 71 insertions(+), 63 deletions(-)

diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index d60e196b7f..1771834aac 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -50,10 +50,14 @@ namespace
 // Internal window config info
 using CLPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
 
-void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo pool_info)
+void auto_init(const ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, PoolingLayerInfo pool_info)
 {
     TensorShape out_shape = compute_pool_shape(*input, pool_info);
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape));
+    if(indices)
+    {
+        auto_init_if_empty(*indices, input->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
+    }
 }
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
@@ -63,16 +67,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
                                     "Unsupported combination of parameters!");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
-                                    && (input->data_layout() == DataLayout::NHWC),
-                                    "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
 
+    // Check indices
     if(indices)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+
+        if(indices->total_size() != 0)
+        {
+            TensorInfo idx_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, DataType::U32));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
+        }
     }
 
     // Checks performed when output is configured
@@ -108,9 +115,9 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
     const int pool_pad_top    = pad_stride_info.pad_top();
     const int pool_pad_left   = pad_stride_info.pad_left();
     const int pool_pad_bottom = pad_stride_info.pad_bottom();
-    BorderSize border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
+    BorderSize border_size = BorderSize();
 
-    auto_init(input, output, pool_info);
+    auto_init(input, output, indices, pool_info);
     pooled_w = output->tensor_shape()[idx_width];
     pooled_h = output->tensor_shape()[idx_height];
@@ -126,6 +133,8 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
     {
         case DataLayout::NCHW:
         {
+            // Initialize border size
+            border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
             // Change the number of elements processed per iteration
             // for pooling 3x3 with stride less equal than 3
             const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
@@ -165,27 +174,17 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
         }
         case DataLayout::NHWC:
         {
-            num_elems_processed_per_iteration = 8;
+            // Initialize border size
+            border_size                       = BorderSize();
+            num_elems_processed_per_iteration = adjust_vec_size(4, output->dimension(0));
             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowStatic input_access(input,
-                                            0, -1,
-                                            ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1));
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-            // Update indices window
-            if(indices)
+            if(indices != nullptr)
             {
-                AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
-                window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
-                indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
-            }
-            else
-            {
-                window_changed = update_window_and_padding(win, input_access, output_access);
+                indices->set_valid_region(ValidRegion(Coordinates(), indices->tensor_shape()));
             }
-            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
             break;
         }
         default:
@@ -228,6 +227,7 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
     const int  idx_width      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const int  idx_height     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
     const int  idx_channel    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int  idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
     const int  pool_size_x    = pool_info.is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width;
     const int  pool_size_y    = pool_info.is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
@@ -246,17 +246,11 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
     ICLKernel::configure_internal(std::get<1>(win_config));
 
-    if(_data_layout == DataLayout::NCHW)
-    {
-        CLPoolingConfig pooling_config     = std::get<2>(win_config);
-        _num_elems_processed_per_iteration = pooling_config.first;
-        _border_size                       = pooling_config.second;
-    }
-    else
-    {
-        _border_size                       = BorderSize(1, 0, 0, 0);
-        _num_elems_processed_per_iteration = 8;
-    }
+    CLPoolingConfig pooling_config     = std::get<2>(win_config);
+    _num_elems_processed_per_iteration = pooling_config.first;
+    _border_size                       = pooling_config.second;
+
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
 
     // Tensor paddings are used to calculate the indicies for MAX pooling
     if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
@@ -282,7 +276,8 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
     }
 
     // Check output dimensions
-    auto_init(input->info(), output->info(), pool_info);
+    auto_init(input->info(), output->info(), indices ? indices->info() : nullptr, pool_info);
+
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
 
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -314,19 +309,20 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
         build_opts.add_option("-DINITIAL_VALUE=0");
     }
 
-    const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
-    const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
-    const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
-    build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
-    build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+    build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+    build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
 
     // Create kernel
     switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
-            build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
-            build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+            const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
+            const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
+            const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
+            build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
+            build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+
             if(pool_type != PoolingType::MAX)
             {
                 build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
@@ -365,26 +361,38 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
         }
         case DataLayout::NHWC:
         {
+            // Floating point mixed precision is supported on F16 only
+            const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+
+            // Wider accumulation is required to avoid accuracy loss
+            // Case 1: Floating point mixed precision (fp16 input data and fp32 accumulation)
+            // Case 2: Quantized (int8/uint8 input data and int32 accumulation)
+            DataType acc_data_type = data_type;
+
+            if(use_fp_mixed_precision)
+            {
+                acc_data_type = DataType::F32;
+            }
+            else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
+            {
+                acc_data_type = DataType::S32;
+            }
+
+            build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
+            build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
             build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
-            build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
-            build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
-            build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
-                                     "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
-            build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
-                                     "-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->tensor_shape().total_size_upper(3)));
-
-            if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+            build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+            build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+            build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
+            build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
+            build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch_size)));
+            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % _num_elems_processed_per_iteration));
+            if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
             {
-                if(data_type == DataType::F32)
-                {
-                    std::string kernel_name = "pooling_layer_2_nhwc_indices_fp32";
-                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-                }
-                else if(data_type == DataType::F16)
-                {
-                    std::string kernel_name = "pooling_layer_2_nhwc_indices_fp16";
-                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-                }
+                build_opts.add_option_if(_indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
+
+                std::string kernel_name = "pooling_layer_2x2_nhwc";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
             }
             else
             {
@@ -452,7 +460,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
                 unsigned int idx = 0;
                 add_3D_tensor_argument(idx, _input, in_slice);
                 add_3D_tensor_argument(idx, _output, slice);
-                if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+                if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
                 {
                     add_3D_tensor_argument(idx, _indices, slice);
                 }
@@ -463,14 +471,14 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
         }
         case DataLayout::NHWC:
         {
-            const size_t total_batches = _output->info()->tensor_shape().total_size_upper(3);
+            const size_t batch_size = _output->info()->tensor_shape().total_size_upper(3);
 
             Window slice    = window_collapsed.first_slice_window_4D();
             Window in_slice = window_collapsed.first_slice_window_4D();
             in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
             in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
             in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
-            in_slice.set(3, Window::Dimension(0, total_batches, 1));
+            in_slice.set(3, Window::Dimension(0, batch_size, 1));
             do
             {
                 // Set inputs