From a527e8c0ac7a82de4618dfe6aa312d4f6ca2e485 Mon Sep 17 00:00:00 2001 From: Isabella Gottardi Date: Wed, 31 Jan 2018 17:49:25 +0000 Subject: COMPMID-828 - Add support for pool widths 4, 5 & 6 and for non square data sizes - Part 2 (CL) Change-Id: I004906b9b1f11158fe17b4aa2640a7f4685fb929 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118462 Tested-by: Jenkins Reviewed-by: Michele DiGiorgio Reviewed-by: Anthony Barbier --- src/core/CL/CLKernelLibrary.cpp | 4 +- src/core/CL/cl_kernels/pooling_layer.cl | 28 ++++++------- src/core/CL/cl_kernels/pooling_layer_quantized.cl | 20 ++++----- src/core/CL/kernels/CLPoolingLayerKernel.cpp | 51 ++++++++++------------- 4 files changed, 49 insertions(+), 54 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index c26d8d80a6..8693a728ba 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -302,8 +302,8 @@ const std::map CLKernelLibrary::_kernel_program_map = { "pooling_layer_3", "pooling_layer.cl" }, { "pooling_layer_optimized_3", "pooling_layer.cl" }, { "pooling_layer_7", "pooling_layer.cl" }, - { "pooling_layer_N", "pooling_layer.cl" }, - { "pooling_layer_N_quantized", "pooling_layer_quantized.cl" }, + { "pooling_layer_MxN", "pooling_layer.cl" }, + { "pooling_layer_MxN_quantized", "pooling_layer_quantized.cl" }, { "quantization_layer", "quantization_layer.cl" }, { "reduction_operation", "reduction_operation.cl" }, { "remap_nearest_neighbour", "remap.cl" }, diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index ee8ff27ab7..dae0b99908 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -183,13 +183,13 @@ res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03)); \ }) -DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h, +DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { int start_x = get_global_id(0) * stride_x - pad_x; int start_y = get_global_id(1) * stride_y - pad_y; - const int end_x = min(start_x + pool_size, upper_bound_w); - const int end_y = min(start_y + pool_size, upper_bound_h); + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); #if defined(EXCLUDE_PADDING) start_x = max(0, start_x); start_y = max(0, start_y); @@ -249,7 +249,7 @@ __kernel void pooling_layer_2( #if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average or l2 pooling - res = DIV_OP(res, calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); + res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); #endif /* defined(POOL_AVG) || defined(POOL_L2) */ #if defined(POOL_L2) @@ -317,7 +317,7 @@ __kernel void pooling_layer_3( #if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average pooling - res = DIV_OP(res, calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); + res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); #endif /* defined(POOL_AVG) || defined(POOL_L2) */ #if defined(POOL_L2) @@ -403,7 +403,7 @@ __kernel void pooling_layer_optimized_3( } #endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION) -#if defined(POOL_SIZE) +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) // Set the initial value for the pooling operation accordingly with the data type #if defined(POOL_AVG) || defined(POOL_L2) @@ -427,7 +427,7 @@ __kernel void pooling_layer_optimized_3( * * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32; * @note -DFP16 must be passed at compile time if half float data type is used - * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13; + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; * @note In case of average pooling the following information must be passed at compile time: * -DPOOL_AVG must be provided otherwise max pooling will be performed. * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) @@ -451,7 +451,7 @@ __kernel void pooling_layer_optimized_3( * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image */ -__kernel void pooling_layer_N( +__kernel void pooling_layer_MxN( TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { @@ -464,10 +464,10 @@ __kernel void pooling_layer_N( DATA_TYPE sdata = INITIAL_VALUE; // Load data - for(int y = 0; y < POOL_SIZE; y++) + for(int y = 0; y < POOL_SIZE_Y; y++) { int x = 0; - for(; x <= ((int)POOL_SIZE - 8); x += 8) + for(; x <= ((int)POOL_SIZE_X - 8); x += 8) { VEC_DATA_TYPE(DATA_TYPE, 8) data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); @@ -479,7 +479,7 @@ __kernel void pooling_layer_N( } // Leftover - for(; x < (int)POOL_SIZE; ++x) + for(; x < (int)POOL_SIZE_X; ++x) { DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); #if defined(POOL_L2) @@ -500,7 +500,7 @@ __kernel void pooling_layer_N( #if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average pooling - res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); + res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); #endif /* defined(POOL_AVG) || defined(POOL_L2) */ #if defined(POOL_L2) @@ -511,4 +511,4 @@ __kernel void pooling_layer_N( // Store result *(__global DATA_TYPE *)output.ptr = res; } -#endif // defined(POOL_SIZE) +#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl index 39c2c22016..98850c00a5 100644 --- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl +++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -35,13 +35,13 @@ #error "L2 pooling is not supported" #endif /* defined(POOL_L2) */ -int calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h, +int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { int start_x = get_global_id(0) * stride_x - pad_x; int start_y = get_global_id(1) * stride_y - pad_y; - const int end_x = min(start_x + pool_size, upper_bound_w); - const int end_y = min(start_y + pool_size, upper_bound_h); + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); #if defined(EXCLUDE_PADDING) start_x = max(0, start_x); start_y = max(0, start_y); @@ -51,7 +51,7 @@ int calculate_avg_scale(const int pool_size, const int upper_bound_w, const int /** Performs a pooling function of pool size equal to N * - * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13; + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; * @note In case of average pooling the following information must be passed at compile time: * -DPOOL_AVG must be provided otherwise max pooling will be performed. * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) @@ -75,7 +75,7 @@ int calculate_avg_scale(const int pool_size, const int upper_bound_w, const int * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image */ -__kernel void pooling_layer_N_quantized( +__kernel void pooling_layer_MxN_quantized( TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { @@ -87,10 +87,10 @@ __kernel void pooling_layer_N_quantized( int sdata = 0; // Load data - for(int y = 0; y < POOL_SIZE; y++) + for(int y = 0; y < POOL_SIZE_Y; y++) { int x = 0; - for(; x <= ((int)POOL_SIZE - 8); x += 8) + for(; x <= ((int)POOL_SIZE_X - 8); x += 8) { uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0)); int8 data0 = convert_int8(data); @@ -98,7 +98,7 @@ __kernel void pooling_layer_N_quantized( } // Leftover - for(; x < (int)POOL_SIZE; ++x) + for(; x < (int)POOL_SIZE_X; ++x) { uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0)); int data0 = convert_int(data); @@ -113,7 +113,7 @@ __kernel void pooling_layer_N_quantized( res = POOL_OP(res, sdata); #if defined(POOL_AVG) - res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); + res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); #endif /* defined(POOL_AVG) */ // Store result diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index 043a4bde04..bc5ff73b63 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -63,13 +63,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c "Unsupported combination of parameters!"); const bool is_global_pooling = pool_info.is_global_pooling(); - const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width; + const unsigned int pool_size_x = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width; + const unsigned int pool_size_y = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()), - "Global pooling is supported only with rectangular inputs!"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size_x) || (pool_info.pad_stride_info().pad().second >= pool_size_y)), "Invalid pool size and pool pad combination!"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_size().width != pool_info.pool_size().height, "Invalid Pool size, width not equal to height!"); // Checks performed when output is configured if(output->total_size() != 0) @@ -81,8 +79,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c unsigned int pooled_h = 0; std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), input->dimension(1), - pool_size, - pool_size, + pool_size_x, + pool_size_y, pool_info.pad_stride_info()); ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h), "Invalid output pooling dimensions!"); @@ -99,21 +97,19 @@ std::tuple validate_and_configure_window(ITenso int pool_stride_y = 0; unsigned int pooled_w = 0; unsigned int pooled_h = 0; - int pool_size = pool_info.pool_size().width; + int pool_size_x = pool_info.is_global_pooling() ? input->dimension(0) : pool_info.pool_size().width; + int pool_size_y = pool_info.is_global_pooling() ? input->dimension(1) : pool_info.pool_size().height; const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - // Update pool size in case of global pooling - pool_size = pool_info.is_global_pooling() ? input->dimension(0) : pool_size; - // Check output dimensions std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), input->dimension(1), - pool_size, - pool_size, + pool_size_x, + pool_size_y, pad_stride_info); auto_init(input, output, pooled_w, pooled_h); @@ -126,23 +122,23 @@ std::tuple validate_and_configure_window(ITenso // Change the number of elements processed per iteration // for pooling 3x3 with stride less equal than 3 - const bool can_optimize = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); + const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); const unsigned int num_elems_processed_per_iteration = can_optimize ? 4 : 1; - const int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size; + const int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x; // Number of iterations in X dimension const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; // Upper limit for the number of right/bottom border elements that are accessed const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size_y) - input_height; border_size.right = std::max(upper_bound_w, pool_pad_x); border_size.bottom = std::max(upper_bound_h, pool_pad_y); Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowRectangle input_access(input, -pool_pad_x, -pool_pad_y, num_elems_read_per_iteration, pool_size, + AccessWindowRectangle input_access(input, -pool_pad_x, -pool_pad_y, num_elems_read_per_iteration, pool_size_y, pool_stride_x * num_elems_processed_per_iteration, pool_stride_y); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); bool window_changed = update_window_and_padding(win, input_access, output_access); @@ -172,7 +168,8 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int pooled_w = 0; unsigned int pooled_h = 0; const PoolingType pool_type = pool_info.pool_type(); - int pool_size = pool_info.pool_size().width; + const int pool_size_x = pool_info.is_global_pooling() ? input->info()->dimension(0) : pool_info.pool_size().width; + const int pool_size_y = pool_info.is_global_pooling() ? input->info()->dimension(1) : pool_info.pool_size().height; const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); const bool exclude_padding = pool_info.exclude_padding(); std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); @@ -180,14 +177,11 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - // Update pool size in case of global pooling - pool_size = pool_info.is_global_pooling() ? input->info()->dimension(0) : pool_size; - // Check output dimensions std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), - pool_size, - pool_size, + pool_size_x, + pool_size_y, pad_stride_info); auto_init(input->info(), output->info(), pooled_w, pooled_h); @@ -220,22 +214,23 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, } // Create kernel - if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type)) + if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type)) { // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where // each thread computes 4 output elements - const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type); + const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type); std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") - + support::cpp11::to_string(pool_size); + + support::cpp11::to_string(pool_size_x); _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); } else // Run general case { - build_opts.add_option("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)); + build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); + build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); build_opts.add_option_if(data_type == DataType::F16, "-DFP16"); - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_N_quantized" : "pooling_layer_N"; + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized" : "pooling_layer_MxN"; _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); } -- cgit v1.2.1