path: root/src/core/CL/kernels/CLPoolingLayerKernel.cpp
author     Gian Marco Iodice <gianmarco.iodice@arm.com>    2020-10-08 10:25:49 +0100
committer  Gian Marco Iodice <gianmarco.iodice@arm.com>    2020-10-19 12:18:07 +0000
commit     7333e1f10f5da9dc67b511d326121a843771a107 (patch)
tree       9b36047d0f13846406680bc01a2cb8df8660de66 /src/core/CL/kernels/CLPoolingLayerKernel.cpp
parent     e4558b501bc4a8e4e731517916a29fb1594d2a78 (diff)
COMPMID-3732: Remove OpenCL padding from CLPoolingLayer
- Refactor the pooling layer kernels on OpenCL (F32/F16/QASYMM8) to avoid padding and improve performance
- Add a test for checking the zero-padding requirement
- Fix an issue with extracting the index, caused by the padding passed at compile time
- auto_init the indices tensor in CLPoolingLayerKernel

Change-Id: I1ae5a2ef8c4ce787c80dcd73e35c17bb34623cb5
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4188
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
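The zero-padding test mentioned above boils down to configuring the layer and asserting that no tensor ends up requesting border padding. Below is a minimal sketch of such a check, assuming the public CLPoolingLayer API; the helper name, tensor sizes, and main() harness are illustrative, not the actual test added by this patch.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"

using namespace arm_compute;

// Illustrative check: after this patch, configuring an NHWC pooling layer
// must leave both tensors with empty OpenCL border padding.
bool pooling_layer_has_zero_padding()
{
    TensorInfo src_info(TensorShape(33U, 27U, 11U, 2U), 1, DataType::F32); // C, W, H, N
    src_info.set_data_layout(DataLayout::NHWC);

    CLTensor src;
    CLTensor dst; // left uninitialized: auto-initialized during configure()
    src.allocator()->init(src_info);

    CLPoolingLayer pool;
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC));

    return src.info()->padding().empty() && dst.info()->padding().empty();
}

int main()
{
    CLScheduler::get().default_init(); // a CL context is required before configure()
    return pooling_layer_has_zero_padding() ? 0 : 1;
}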
Diffstat (limited to 'src/core/CL/kernels/CLPoolingLayerKernel.cpp')
-rw-r--r--  src/core/CL/kernels/CLPoolingLayerKernel.cpp  134
1 file changed, 71 insertions(+), 63 deletions(-)
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index d60e196b7f..1771834aac 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -50,10 +50,14 @@ namespace
// Internal window config info
using CLPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
-void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo pool_info)
+void auto_init(const ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, PoolingLayerInfo pool_info)
{
TensorShape out_shape = compute_pool_shape(*input, pool_info);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape));
+ if(indices)
+ {
+ auto_init_if_empty(*indices, input->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
+ }
}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
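For readers unfamiliar with auto_init_if_empty: it only fills in metadata when the destination info is still empty, which is why the new indices parameter above cannot clobber a caller-provided shape. A self-contained sketch of the assumed semantics, using simplified stand-ins rather than the library types:

#include <cstddef>
#include <iostream>

// Simplified model of ITensorInfo: total_size == 0 means "not yet configured".
struct MiniTensorInfo
{
    std::size_t total_size = 0;
    int         data_type  = 0; // e.g. 1 = F32, 2 = U32
};

// Assumed behaviour of auto_init_if_empty: initialize only empty infos.
void auto_init_if_empty(MiniTensorInfo &info, std::size_t size, int data_type)
{
    if(info.total_size == 0)
    {
        info.total_size = size;
        info.data_type  = data_type;
    }
}

int main()
{
    MiniTensorInfo indices;              // empty: receives the pooled shape and U32
    auto_init_if_empty(indices, 64, 2);

    MiniTensorInfo user_indices{128, 2}; // caller-provided: left untouched
    auto_init_if_empty(user_indices, 64, 2);

    std::cout << indices.total_size << " " << user_indices.total_size << '\n'; // 64 128
}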
@@ -63,16 +67,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
"Unsupported combination of parameters!");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
- && (input->data_layout() == DataLayout::NHWC),
- "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
+
// Check indices
if(indices)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+
+ if(indices->total_size() != 0)
+ {
+ TensorInfo idx_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, DataType::U32));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
+ }
}
// Checks performed when output is configured
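With the added shape check, a preconfigured indices tensor whose shape does not match the pooled output is now rejected up front. A hedged usage sketch through the public validate() entry point; the shapes and the NHWC/2x2/stride-2 configuration are illustrative:

#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"

using namespace arm_compute;

int main()
{
    TensorInfo src(TensorShape(16U, 8U, 8U, 1U), 1, DataType::F32); // C, W, H, N
    src.set_data_layout(DataLayout::NHWC);
    TensorInfo dst; // empty: inferred internally

    const PoolingLayerInfo info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC,
                                PadStrideInfo(2, 2, 0, 0));

    // Wrong spatial extent: the pooled shape would be 16x4x4x1, not 16x3x3x1.
    TensorInfo bad_indices(TensorShape(16U, 3U, 3U, 1U), 1, DataType::U32);

    const Status status = CLPoolingLayer::validate(&src, &dst, info, &bad_indices);
    return status.error_code() == ErrorCode::OK ? 1 : 0; // expect a mismatching-shapes error
}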
@@ -108,9 +115,9 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
const int pool_pad_top = pad_stride_info.pad_top();
const int pool_pad_left = pad_stride_info.pad_left();
const int pool_pad_bottom = pad_stride_info.pad_bottom();
- BorderSize border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
+ BorderSize border_size = BorderSize();
- auto_init(input, output, pool_info);
+ auto_init(input, output, indices, pool_info);
pooled_w = output->tensor_shape()[idx_width];
pooled_h = output->tensor_shape()[idx_height];
@@ -126,6 +133,8 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
{
case DataLayout::NCHW:
{
+ // Initialize border size
+ border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
// Change the number of elements processed per iteration
// for 3x3 pooling with a stride less than or equal to 3
const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
@@ -165,27 +174,17 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
}
case DataLayout::NHWC:
{
- num_elems_processed_per_iteration = 8;
+ // Initialize border size
+ border_size = BorderSize();
+ num_elems_processed_per_iteration = adjust_vec_size(4, output->dimension(0));
win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(input,
- 0, -1,
- ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1));
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- // Update indices window
- if(indices)
+ if(indices != nullptr)
{
- AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
- indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
- }
- else
- {
- window_changed = update_window_and_padding(win, input_access, output_access);
+ indices->set_valid_region(ValidRegion(Coordinates(), indices->tensor_shape()));
}
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
break;
}
default:
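The NHWC branch above replaces the fixed vector width of 8 with adjust_vec_size(4, output->dimension(0)), and later in the patch the kernel is built with -DVEC_SIZE and -DVEC_SIZE_LEFTOVER so that a partial vector at the row boundary is handled in-kernel instead of via right-hand padding. A sketch of the assumed arithmetic; adjust_vec_size is modelled here, not copied from the library:

#include <cstdio>

// Assumed behaviour of adjust_vec_size: halve the preferred vector width
// until it no longer exceeds the innermost (channel) dimension.
unsigned int adjust_vec_size(unsigned int vec_size, unsigned int dim0)
{
    while(vec_size > dim0)
    {
        vec_size /= 2;
    }
    return vec_size;
}

int main()
{
    const unsigned int channels = 10;                           // innermost NHWC dimension
    const unsigned int vec_size = adjust_vec_size(4, channels); // -> 4
    const unsigned int leftover = channels % vec_size;          // -> 2

    // The kernel would be compiled with -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=2 and
    // clamps its boundary loads/stores, so no tensor padding is needed.
    std::printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
}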
@@ -228,6 +227,7 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
const int pool_size_x = pool_info.is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width;
const int pool_size_y = pool_info.is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height;
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
@@ -246,17 +246,11 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
ICLKernel::configure_internal(std::get<1>(win_config));
- if(_data_layout == DataLayout::NCHW)
- {
- CLPoolingConfig pooling_config = std::get<2>(win_config);
- _num_elems_processed_per_iteration = pooling_config.first;
- _border_size = pooling_config.second;
- }
- else
- {
- _border_size = BorderSize(1, 0, 0, 0);
- _num_elems_processed_per_iteration = 8;
- }
+ CLPoolingConfig pooling_config = std::get<2>(win_config);
+ _num_elems_processed_per_iteration = pooling_config.first;
+ _border_size = pooling_config.second;
+
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
// Tensor paddings are used to calculate the indices for MAX pooling
if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
@@ -282,7 +276,8 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
}
// Check output dimensions
- auto_init(input->info(), output->info(), pool_info);
+ auto_init(input->info(), output->info(), indices ? indices->info() : nullptr, pool_info);
+
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -314,19 +309,20 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
build_opts.add_option("-DINITIAL_VALUE=0");
}
- const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
- const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
- const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
- build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
- build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+ build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+ build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
// Create kernel
switch(_data_layout)
{
case DataLayout::NCHW:
{
- build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
- build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+ const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
+ const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
+ const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
+ build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
+ build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+
if(pool_type != PoolingType::MAX)
{
build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
@@ -365,26 +361,38 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
}
case DataLayout::NHWC:
{
+ // Floating point mixed precision is supported on F16 only
+ const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+
+ // Wider accumulation is required to avoid accuracy loss
+ // Case 1: Floating point mixed precision (fp16 input data and fp32 accumulation)
+ // Case 2: Quantized (int8/uint8 input data and int32 accumulation)
+ DataType acc_data_type = data_type;
+
+ if(use_fp_mixed_precision)
+ {
+ acc_data_type = DataType::F32;
+ }
+ else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
+ {
+ acc_data_type = DataType::S32;
+ }
+
+ build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
+ build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
- build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
- build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
- "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
- build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
- "-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->tensor_shape().total_size_upper(3)));
-
- if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+ build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
+ build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
+ build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch_size)));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % _num_elems_processed_per_iteration));
+ if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
{
- if(data_type == DataType::F32)
- {
- std::string kernel_name = "pooling_layer_2_nhwc_indices_fp32";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
- else if(data_type == DataType::F16)
- {
- std::string kernel_name = "pooling_layer_2_nhwc_indices_fp16";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- }
+ build_opts.add_option_if(_indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
+
+ std::string kernel_name = "pooling_layer_2x2_nhwc";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
}
else
{
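The "wider accumulation" comment in this hunk can be checked with simple arithmetic: averaging a 3x3 window of QASYMM8 values can sum to 9 * 255 = 2295, which does not fit in 8 bits. A small standalone example of the failure mode that the S32 (and, for F16, F32) accumulators avoid:

#include <cstdint>
#include <cstdio>

int main()
{
    // 3x3 average pooling over saturated QASYMM8 inputs.
    const std::uint8_t window[9] = {255, 255, 255, 255, 255, 255, 255, 255, 255};

    std::uint8_t narrow_acc = 0; // 8-bit accumulator: wraps around
    std::int32_t wide_acc   = 0; // S32 accumulator, as selected for the NHWC kernel
    for(int i = 0; i < 9; ++i)
    {
        narrow_acc = static_cast<std::uint8_t>(narrow_acc + window[i]);
        wide_acc += window[i];
    }

    // narrow: 2295 mod 256 = 247 -> average 27 (wrong)
    // wide:   2295             -> average 255 (correct)
    std::printf("narrow avg = %d, wide avg = %d\n", narrow_acc / 9, wide_acc / 9);
}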
@@ -452,7 +460,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, slice);
- if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+ if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
{
add_3D_tensor_argument(idx, _indices, slice);
}
@@ -463,14 +471,14 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
}
case DataLayout::NHWC:
{
- const size_t total_batches = _output->info()->tensor_shape().total_size_upper(3);
+ const size_t batch_size = _output->info()->tensor_shape().total_size_upper(3);
Window slice = window_collapsed.first_slice_window_4D();
Window in_slice = window_collapsed.first_slice_window_4D();
in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
- in_slice.set(3, Window::Dimension(0, total_batches, 1));
+ in_slice.set(3, Window::Dimension(0, batch_size, 1));
do
{
// Set inputs
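The input window set up above advances by the pooling strides while the output window advances element-wise, so each output coordinate reads from in = out * stride. A small illustration of that mapping, assuming stride 2 in both spatial dimensions:

#include <cstdio>

int main()
{
    const int stride_x = 2, stride_y = 2; // pool_stride_x / pool_stride_y above
    const int out_w = 3, out_h = 3;       // pooled output extent (illustrative)

    // Each output element maps to the top-left corner of its pooling window,
    // which is what the stepped Window dimensions encode for the NHWC path.
    for(int oy = 0; oy < out_h; ++oy)
    {
        for(int ox = 0; ox < out_w; ++ox)
        {
            std::printf("out(%d,%d) <- in(%d,%d)\n", ox, oy, ox * stride_x, oy * stride_y);
        }
    }
}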