From 9f26b3e794228d0e2804e5f1f7fe887124e4e825 Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Tue, 28 Nov 2017 14:35:00 +0000 Subject: COMPMID-617 Add validation window to CLPoolingLayer Change-Id: I1ef117399ff694c34178dd973458d52cd5ebf1f6 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110939 Reviewed-by: Anthony Barbier Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com --- src/core/CL/kernels/CLPoolingLayerKernel.cpp | 241 +++++++++++++++++---------- 1 file changed, 151 insertions(+), 90 deletions(-) (limited to 'src/core/CL/kernels/CLPoolingLayerKernel.cpp') diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index 26c26be1fb..9db5c48a3b 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -41,6 +41,130 @@ using namespace arm_compute; +namespace +{ +// Internal window config info +using CLPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size + +void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h) +{ + TensorShape output_shape{ input->tensor_shape() }; + output_shape.set(0, pooled_w); + output_shape.set(1, pooled_h); + + auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position(), input->quantization_info()); +} + +Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2), + "Unsupported combination of parameters!"); + + const bool is_global_pooling = pool_info.is_global_pooling(); + const unsigned int pool_size = is_global_pooling ? 
input->tensor_shape().x() : pool_info.pool_size(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()), + "Global pooling is supported only with rectangular inputs!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)), + "Invalid pool size and pool pad combination!"); + + // Checks performed when output is configured + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), + input->dimension(1), + pool_size, + pool_size, + pool_info.pad_stride_info()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h), + "Invalid output pooling dimensions!"); + } + + return Error{}; +} + +std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info) +{ + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + int pool_size = pool_info.pool_size(); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Update pool size in case of global pooling + pool_size = pool_info.is_global_pooling() ? input->dimension(0) : pool_size; + + // Check output dimensions + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), + input->dimension(1), + pool_size, + pool_size, + pad_stride_info); + + auto_init(input, output, pooled_w, pooled_h); + + BorderSize border_size = BorderSize(pool_pad_y, pool_pad_x); + const DataType data_type = input->data_type(); + + const int input_width = input->dimension(0); + const int input_height = input->dimension(1); + + unsigned int num_elems_processed_per_iteration = 1; + + if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type)) + { + const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type); + + int num_elems_read_per_iteration = pool_size; + if(is_pool3x3_stride_le3) + { + // Change the number of elements processed and the number of elements read per iteration + // for pooling 3x3 with stride less equal than 3 + num_elems_processed_per_iteration = 4; + num_elems_read_per_iteration = pool_size * (pool_stride_x + 1); + } + + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + border_size.right = std::max(upper_bound_w, pool_pad_x); + border_size.bottom = std::max(upper_bound_h, pool_pad_y); + } + else + { + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + border_size.right = std::max(upper_bound_w, pool_pad_x); + border_size.bottom = std::max(upper_bound_h, pool_pad_y); + } + + Window win = calculate_max_window(*output, 
Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + bool window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_tuple(err, win, CLPoolingConfig(num_elems_processed_per_iteration, border_size)); +} +} // namespace + CLPoolingLayerKernel::CLPoolingLayerKernel() : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1) { @@ -53,56 +177,39 @@ BorderSize CLPoolingLayerKernel::border_size() const void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info) { - int pool_pad_x = 0; - int pool_pad_y = 0; - int pool_stride_x = 0; - int pool_stride_y = 0; - unsigned int pooled_w = 0; - unsigned int pooled_h = 0; - const PoolingType pool_type = pool_info.pool_type(); - int pool_size = pool_info.pool_size(); - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); - const bool exclude_padding = pool_info.exclude_padding(); - const bool is_global_pooling = pool_info.is_global_pooling(); + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + const PoolingType pool_type = pool_info.pool_type(); + int pool_size = pool_info.pool_size(); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + const bool exclude_padding = pool_info.exclude_padding(); std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Update pool size in case of global pooling - pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size; + pool_size = pool_info.is_global_pooling() ? 
input->info()->dimension(0) : pool_size; // Check output dimensions std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), pool_size, pool_size, - pool_info.pad_stride_info()); + pad_stride_info); - // Output auto initialization if not yet initialized - { - TensorShape output_shape{ input->info()->tensor_shape() }; - output_shape.set(0, pooled_w); - output_shape.set(1, pooled_h); - - auto_init_if_empty(*output->info(), - output_shape, - 1, - input->info()->data_type(), - input->info()->fixed_point_position(), - input->info()->quantization_info()); - } + auto_init(input->info(), output->info(), pooled_w, pooled_h); ARM_COMPUTE_ERROR_THROW_ON(CLPoolingLayerKernel::validate(input->info(), output->info(), pool_info)); - const int input_width = input->info()->dimension(0); - const int input_height = input->info()->dimension(1); - // Set instance variables - _input = input; - _output = output; - _pool_info = pool_info; - _border_size = BorderSize(pool_pad_y, pool_pad_x); + _input = input; + _output = output; + _pool_info = pool_info; const GPUTarget gpu_target = get_arch_from_target(get_target()); const DataType data_type = input->info()->data_type(); @@ -131,33 +238,12 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, // each thread computes 4 output elements const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type); - int num_elems_read_per_iteration = pool_size; - if(is_pool3x3_stride_le3) - { - // Change the number of elements processed and the number of elements read per iteration - // for pooling 3x3 with stride less equal than 3 - _num_elems_processed_per_iteration = 4; - num_elems_read_per_iteration = pool_size * (pool_stride_x + 1); - } - - const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; - - _border_size.right = std::max(upper_bound_w, pool_pad_x); - _border_size.bottom = std::max(upper_bound_h, pool_pad_y); - std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") + support::cpp11::to_string(pool_size); _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); } else // Run general case { - const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; - - _border_size.right = std::max(upper_bound_w, pool_pad_x); - _border_size.bottom = std::max(upper_bound_h, pool_pad_y); - build_opts.add_option("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)); build_opts.add_option_if(data_type == DataType::F16, "-DFP16"); @@ -166,7 +252,9 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, } // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration)); + auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); // Configure the local work size (hint) from the first two dimensions of the global work size. 
// On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized @@ -174,15 +262,15 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with). if(gpu_target == GPUTarget::BIFROST) { - cl::NDRange gws = ICLKernel::gws_from_window(win); + cl::NDRange gws = ICLKernel::gws_from_window(std::get<1>(win_config)); _lws_hint = cl::NDRange(gws[0], gws[1], 1); } - AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); - AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - ICLKernel::configure(win); + ICLKernel::configure(std::get<1>(win_config)); + + CLPoolingConfig pooling_config = std::get<2>(win_config); + _num_elems_processed_per_iteration = pooling_config.first; + _border_size = pooling_config.second; // Set config_id for enabling LWS tuning _config_id = "pooling_layer_"; @@ -195,35 +283,8 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, Error CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2), - "Unsupported combination of parameters!"); - - const bool is_global_pooling = pool_info.is_global_pooling(); - const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()), - "Global pooling is supported only with rectangular inputs!"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)), - "Invalid pool size and pool pad combination!"); - - // Checks performed when output is configured - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); - - unsigned int pooled_w = 0; - unsigned int pooled_h = 0; - std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), - input->dimension(1), - pool_size, - pool_size, - pool_info.pad_stride_info()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h), - "Invalid output pooling dimensions!"); - } + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info))); return Error{}; } -- cgit v1.2.1
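Usage sketch (not part of the patch): the static CLPoolingLayerKernel::validate() entry point reworked above lets a caller check a pooling configuration up front, before any OpenCL kernel is built, by running validate_arguments() and validate_and_configure_window() on cloned tensor infos. The snippet below is a minimal, hedged example of how it can be called. The validate() signature and ARM_COMPUTE_ERROR_THROW_ON come from this patch; the TensorInfo, PoolingLayerInfo and PadStrideInfo constructors and header paths are assumed from the same library era and may differ in other releases, and the shapes and pooling parameters are purely illustrative.

    #include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    int main()
    {
        // Hypothetical configuration: 3x3 max pooling, stride 2, pad 1, on a 64x64x16 FP32 input.
        const TensorInfo       input(TensorShape(64U, 64U, 16U), 1, DataType::F32);
        TensorInfo             output{}; // left empty: the pooled output shape is auto-inferred during validation
        const PoolingLayerInfo pool_info(PoolingType::MAX, 3, PadStrideInfo(2, 2, 1, 1));

        // Checks both the argument combination and the window/padding configuration
        // without allocating tensors or touching any CL state.
        const Error err = CLPoolingLayerKernel::validate(&input, &output, pool_info);
        ARM_COMPUTE_ERROR_THROW_ON(err); // raises with the error description (e.g. "Insufficient Padding!") on failure

        return 0;
    }

Because validate() operates on clones of the tensor infos, the caller's output info is left untouched; the same pool_info can then be passed to configure() once the check succeeds.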