From 7567f5f1919f69ea00c2cd5bdca65b67dfe6b388 Mon Sep 17 00:00:00 2001 From: Isabella Gottardi Date: Tue, 30 Jan 2018 15:26:00 +0000 Subject: COMPMID-828 - Add support for pool widths 4, 5 & 6 and for non square data sizes - Part 2 (NEON) Change-Id: I64bc8e3f71236edb71494f431ee34077eb8814ca Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118203 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 975 +++++++++++++++++-------- 1 file changed, 653 insertions(+), 322 deletions(-) (limited to 'src/core/NEON/kernels/NEPoolingLayerKernel.cpp') diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index a3ab8a361f..b6af51733a 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -60,13 +60,13 @@ void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int poole } template -inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h, +inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { int start_x = id.x() * stride_x - pad_x; int start_y = id.y() * stride_y - pad_y; - const int end_x = std::min(start_x + pool_size, upper_bound_w); - const int end_y = std::min(start_y + pool_size, upper_bound_h); + const int end_x = std::min(start_x + pool_size_x, upper_bound_w); + const int end_y = std::min(start_y + pool_size_y, upper_bound_h); if(exclude_padding) { start_x = std::max(0, start_x); @@ -151,26 +151,25 @@ inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offs v = vsetq_lane_u16(elems[7], v, 7); } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size_x) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - int pool_stride_x = 0; - int pool_stride_y = 0; - PoolingType pool_type = pool_info.pool_type(); - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); - const bool exclude_padding = pool_info.exclude_padding(); - const bool is_global_pooling = pool_info.is_global_pooling(); + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type(); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + const bool exclude_padding = pool_info.exclude_padding(); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); static const std::set supported_pool_sizes = { 2, 3 }; ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8))); - ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y())); + + ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == 
supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8)) + && (pool_type != PoolingType::MAX)); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2); ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x != pool_size_y); if(output->total_size() != 0) { @@ -182,19 +181,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size) +Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y) { - const bool is_global_pooling = pool_info.is_global_pooling(); - ARM_COMPUTE_UNUSED(pool_size); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()), - "Global pooling is supported only with rectangular inputs!"); + ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0); + ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0); return Status{}; } std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration, BorderSize &border_size, - unsigned int pooled_w, unsigned int pooled_h, int pool_size) + unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y) { unsigned int num_elems_read_per_iteration = 0; unsigned int num_elems_horizontal_window = 0; @@ -204,122 +201,124 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const int input_height = input->dimension(1); const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_right = pad_stride_info.pad_right(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const int pool_pad_bottom = pad_stride_info.pad_bottom(); - + const int pool_pad_right = pad_stride_info.pad_right(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const int pool_pad_bottom = pad_stride_info.pad_bottom(); + const bool is_square = pool_size_x == pool_size_y; // Check output dimensions std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), input->dimension(1), - pool_size, - pool_size, + pool_size_x, + pool_size_y, pad_stride_info); - // Select element size - switch(input->data_type()) + //If it's not squared and optimized will be executed the MxN + num_elems_read_per_iteration = 1; + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + + if(is_square) { - case DataType::QS8: - num_elems_read_per_iteration = 16; - switch(pool_size) - { - case 2: - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; - break; - case 3: - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; - break; - default: - ARM_COMPUTE_ERROR("Pooling size not supported"); - break; - } - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - break; - case DataType::QASYMM8: - switch(pool_size) - { - case 2: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; - num_elems_horizontal_window = (pool_stride_x == 2) ? 
8 : 16; - break; - case 3: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - break; - default: - num_elems_read_per_iteration = 1; - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - } - break; - case DataType::QS16: - num_elems_read_per_iteration = 8; - switch(pool_size) - { - case 2: - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7; - break; - case 3: - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6; - break; - default: - ARM_COMPUTE_ERROR("Pooling size not supported"); - } - num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8; - break; + switch(input->data_type()) + { + case DataType::QS8: + num_elems_read_per_iteration = 16; + switch(pool_size_x) + { + case 2: + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; + break; + case 3: + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; + break; + default: + break; + } + break; + case DataType::QASYMM8: + switch(pool_size_x) + { + case 2: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + break; + case 3: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + break; + default: + break; + } + break; + case DataType::QS16: + num_elems_read_per_iteration = 8; + switch(pool_size_x) + { + case 2: + num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7; + break; + case 3: + num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 
3 : 6; + break; + default: + break; + } + break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - switch(pool_size) - { - case 2: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = 8; - num_elems_horizontal_window = 8; - break; - case 3: - num_elems_read_per_iteration = 4; - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - default: - ARM_COMPUTE_ERROR("Pooling size not supported"); - break; - } - break; + case DataType::F16: + switch(pool_size_x) + { + case 2: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = 8; + num_elems_horizontal_window = 8; + break; + case 3: + num_elems_read_per_iteration = 4; + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + default: + break; + } + break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - switch(pool_size) - { - case 2: - num_elems_read_per_iteration = 2; - break; - case 3: - num_elems_read_per_iteration = 4; // We use vload4 for pooling3 - break; - case 7: - num_elems_read_per_iteration = 8; // We use vload8 for pooling7 - break; - default: - num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop - break; - } - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; + case DataType::F32: + switch(pool_size_x) + { + case 2: + num_elems_read_per_iteration = 2; + break; + case 3: + num_elems_read_per_iteration = 4; // We use vload4 for pooling3 + break; + case 7: + num_elems_read_per_iteration = 8; // We use vload8 for pooling7 + break; + default: + break; + } + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } } - // Number of iterations in X dimension const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; // Upper limit for the number of right/bottom border elements that are accessed const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size) - input_height; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height; border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); border_size.right = std::max(upper_bound_w, pool_pad_right); @@ -351,7 +350,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace NEPoolingLayerKernel::NEPoolingLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0) + : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false) { } @@ -371,29 +370,31 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons const int pool_stride_x = pad_stride_info.stride().first; // Update pool size in case of global pooling - const int pool_size = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width; + const int pool_size_x = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width; + const int pool_size_y = is_global_pooling ? 
input->info()->dimension(1) : pool_info.pool_size().height; // Validate pool info before calling scaled_dimensions - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y)); // Check output dimensions unsigned int pooled_w, pooled_h; std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), - pool_size, - pool_size, + pool_size_x, + pool_size_y, pad_stride_info); // Output auto initialization if not yet initialized auto_init(input->info(), output->info(), pooled_w, pooled_h); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size, pool_size)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size_x)); // Set instance variables _input = input; _output = output; _pool_info = pool_info; + _is_square = (pool_size_x == pool_size_y); // Get data type const DataType data_type = input->info()->data_type(); @@ -401,41 +402,63 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons // Select appropriate function if(data_type == DataType::QS8) { - switch(pool_size) + if(_is_square) { - case 2: - switch(pool_type) - { - case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling2_q8; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_q8; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 3: - switch(pool_type) - { - case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling3_q8; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_q8; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling size!"); + switch(pool_size_x) + { + case 2: + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling2_q8; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_q8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + case 3: + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling3_q8; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_q8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + default: + switch(pool_type) + { + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingMxN_q8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + } + } + else + { + switch(pool_type) + { + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingMxN_q8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } } else if(data_type == DataType::QASYMM8) { - if(pool_size == 2 && pool_stride_x < 3) + if(pool_size_x == 2 && pool_stride_x < 3 && _is_square) { switch(pool_type) { @@ -449,7 +472,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons ARM_COMPUTE_ERROR("Unsupported pooling type!"); } } - else if(pool_size == 3 && pool_stride_x < 3) + else if(pool_size_x == 3 && pool_stride_x < 3 && _is_square) { switch(pool_type) { @@ -468,10 +491,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingN_qasymm8 : &NEPoolingLayerKernel::poolingN_qasymm8; + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8 : &NEPoolingLayerKernel::poolingMxN_qasymm8; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingN_qasymm8; + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -480,151 +503,227 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons } else if(data_type == DataType::QS16) { - switch(pool_size) + if(_is_square) { - case 2: - switch(pool_type) - { - case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling2_q16; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_q16; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 3: - switch(pool_type) - { - case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling3_q16; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_q16; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling size!"); + switch(pool_size_x) + { + case 2: + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling2_q16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_q16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + case 3: + switch(pool_type) + { + case PoolingType::AVG: + _func = &NEPoolingLayerKernel::pooling3_q16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_q16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + default: + switch(pool_type) + { + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingMxN_q16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + } + } + else + { + switch(pool_type) + { + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingMxN_q16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } } else if(data_type == DataType::F16) { - switch(pool_size) + if(_is_square) { - case 2: - switch(pool_type) - { - case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16 : &NEPoolingLayerKernel::pooling2_f16; - break; - case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16 : &NEPoolingLayerKernel::pooling2_f16; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_f16; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 3: - switch(pool_type) - { - case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; - break; - case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_f16; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling size!"); + switch(pool_size_x) + { + case 2: + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16 : &NEPoolingLayerKernel::pooling2_f16; + break; + case PoolingType::L2: + _func = (exclude_padding) ? 
&NEPoolingLayerKernel::pooling2_f16 : &NEPoolingLayerKernel::pooling2_f16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_f16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + case 3: + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; + break; + case PoolingType::L2: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_f16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + default: + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16 : &NEPoolingLayerKernel::poolingMxN_f16; + break; + case PoolingType::L2: + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16 : &NEPoolingLayerKernel::poolingMxN_f16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingMxN_f16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + } + } + else + { + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16 : &NEPoolingLayerKernel::poolingMxN_f16; + break; + case PoolingType::L2: + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16 : &NEPoolingLayerKernel::poolingMxN_f16; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingMxN_f16; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } } else if(data_type == DataType::F32) { - switch(pool_size) + if(_is_square) { - case 2: - switch(pool_type) - { - case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; - break; - case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_f32; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 3: - switch(pool_type) - { - case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; - break; - case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_f32; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 7: - switch(pool_type) - { - case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; - break; - case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling7_f32; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - default: - switch(pool_type) - { - case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32 : &NEPoolingLayerKernel::poolingN_f32; - break; - case PoolingType::L2: - _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingN_f32 : &NEPoolingLayerKernel::poolingN_f32; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingN_f32; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; + switch(pool_size_x) + { + case 2: + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + break; + case PoolingType::L2: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling2_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + case 3: + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + break; + case PoolingType::L2: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling3_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + case 7: + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; + break; + case PoolingType::L2: + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::pooling7_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + default: + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32 : &NEPoolingLayerKernel::poolingMxN_f32; + break; + case PoolingType::L2: + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32 : &NEPoolingLayerKernel::poolingMxN_f32; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingMxN_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + } + } + else + { + switch(pool_type) + { + case PoolingType::AVG: + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32 : &NEPoolingLayerKernel::poolingMxN_f32; + break; + case PoolingType::L2: + _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_f32 : &NEPoolingLayerKernel::poolingMxN_f32; + break; + case PoolingType::MAX: + _func = &NEPoolingLayerKernel::poolingMxN_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } } } // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size); + auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size_x, pool_size_y); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } @@ -913,7 +1012,7 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float16x4_t scale_v = vdup_n_f16(scale); // Perform pooling const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); @@ -979,7 +1078,7 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { - const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float16x8_t scale_v = vdupq_n_f16(scale); res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1])))); } @@ -1042,7 +1141,7 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling @@ -1375,7 +1474,7 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling @@ -1433,7 +1532,7 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling @@ -1486,13 +1585,241 @@ void 
NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window input, output); } +template +void NEPoolingLayerKernel::poolingMxN_q8(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; + const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height; + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + + execute_window_loop(window, [&](const Coordinates & id) + { + qint8x16_t vres = {}; + qint8_t res = {}; + + //PoolingType::MAX + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 16); x += 16) + { + const qint8x16_t data = vld1q_qs8(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); + vres = vmaxq_s8(vres, data); + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + qint8_t data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); + res = std::max(res, data); + } + } + //Reduce + const qint8x8_t half_vres = vpmax_s8(vget_low_s8(vres), vget_high_s8(vres)); + res = std::max(res, vget_lane_s8(half_vres, 0)); + res = std::max(res, vget_lane_s8(half_vres, 1)); + res = std::max(res, vget_lane_s8(half_vres, 2)); + res = std::max(res, vget_lane_s8(half_vres, 3)); + res = std::max(res, vget_lane_s8(half_vres, 4)); + res = std::max(res, vget_lane_s8(half_vres, 5)); + res = std::max(res, vget_lane_s8(half_vres, 6)); + res = std::max(res, vget_lane_s8(half_vres, 7)); + + // Store result + *(reinterpret_cast(output.ptr())) = res; + }, + input, output); +} + +template +void NEPoolingLayerKernel::poolingMxN_q16(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; + const int pool_size_y = _pool_info.is_global_pooling() ? 
_input->info()->tensor_shape().y() : _pool_info.pool_size().height; + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + + execute_window_loop(window, [&](const Coordinates & id) + { + qint16x8_t vres = {}; + qint16_t res = {}; + + //PoolingType::MAX + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 8); x += 8) + { + const qint16x8_t data = vld1q_qs16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); + vres = vmaxq_s16(vres, data); + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + qint16_t data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); + res = std::max(res, data); + } + } + //Reduce + const qint16x4_t half_vres = vpmax_s16(vget_low_s16(vres), vget_high_s16(vres)); + res = std::max(res, vget_lane_s16(half_vres, 0)); + res = std::max(res, vget_lane_s16(half_vres, 1)); + res = std::max(res, vget_lane_s16(half_vres, 2)); + res = std::max(res, vget_lane_s16(half_vres, 3)); + + // Store result + *(reinterpret_cast(output.ptr())) = res; + }, + input, output); +} + template -void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Window &window) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC Iterator input(_input, window_input); Iterator output(_output, window); - const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; + const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; + const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height; + const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 
0 : pool_pad_bottom); + + execute_window_loop(window, [&](const Coordinates & id) + { + float16_t res = 0.0f; + float16x8_t vres = vdupq_n_f16(0.0f); + + if(pooling_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 8); x += 8) + { + const float16x8_t data = vld1q_f16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); + + // Get power of 2 in case of l2 pooling and accumulate + if(pooling_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + float16_t data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); + + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + data *= data; + } + + res += data; + } + } + + // Reduction + float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres)); + res += vget_lane_f16(tmp, 0); + res += vget_lane_f16(tmp, 1); + res += vget_lane_f16(tmp, 2); + res += vget_lane_f16(tmp, 3); + + // Divide by scale + res *= scale; + } + else + { + float16x8_t vres = vdupq_n_f16(std::numeric_limits::lowest()); + res = std::numeric_limits::lowest(); + + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 8); x += 8) + { + const float16x8_t data = vld1q_f16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); + vres = vmaxq_f16(vres, data); + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + const float16_t data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); + res = std::max(res, data); + } + } + + float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres)); + res = std::max(res, vget_lane_f16(tmp, 0)); + res = std::max(res, vget_lane_f16(tmp, 1)); + res = std::max(res, vget_lane_f16(tmp, 2)); + res = std::max(res, vget_lane_f16(tmp, 3)); + } + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast(output.ptr())) = res; + }, + input, output); + +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + ARM_COMPUTE_UNUSED(window_input); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a"); +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +} + +template +void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; + const int pool_size_y = _pool_info.is_global_pooling() ? 
_input->info()->tensor_shape().y() : _pool_info.pool_size().height; const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); @@ -1510,15 +1837,15 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling float32x4_t vres = vdupq_n_f32(0.0f); - for(int y = 0; y < pool_size; ++y) + for(int y = 0; y < pool_size_y; ++y) { int x = 0; - for(; x <= (pool_size - 4); x += 4) + for(; x <= (pool_size_x - 4); x += 4) { const float32x4_t data = vld1q_f32(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); @@ -1535,7 +1862,7 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window } // Leftover for loop - for(; x < pool_size; ++x) + for(; x < pool_size_x; ++x) { float data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); @@ -1564,13 +1891,13 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window } else { - float32x4_t vres = vdupq_n_f32(std::numeric_limits::min()); - res = std::numeric_limits::min(); + float32x4_t vres = vdupq_n_f32(std::numeric_limits::lowest()); + res = std::numeric_limits::lowest(); - for(int y = 0; y < pool_size; ++y) + for(int y = 0; y < pool_size_y; ++y) { int x = 0; - for(; x <= (pool_size - 4); x += 4) + for(; x <= (pool_size_x - 4); x += 4) { const float32x4_t data = vld1q_f32(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); @@ -1578,7 +1905,7 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window } // Leftover for loop - for(; x < pool_size; ++x) + for(; x < pool_size_x; ++x) { const float data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); res = std::max(res, data); @@ -1609,12 +1936,13 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window } template -void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); - const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; + const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; + const int pool_size_y = _pool_info.is_global_pooling() ? 
_input->info()->tensor_shape().y() : _pool_info.pool_size().height; const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); @@ -1635,13 +1963,13 @@ void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Wi uint32_t sres = 0; // Calculate scale - const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling - for(int y = 0; y < pool_size; ++y) + for(int y = 0; y < pool_size_y; ++y) { int x = 0; - for(; x <= (pool_size - 8); x += 8) + for(; x <= (pool_size_x - 8); x += 8) { const uint8x8_t data = vld1_u8(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); @@ -1651,7 +1979,7 @@ void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Wi } // Leftover for loop - for(; x < pool_size; ++x) + for(; x < pool_size_x; ++x) { uint8_t data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); sres += data; @@ -1670,10 +1998,10 @@ void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Wi uint8x8_t vres = vdup_n_u8(0); res = 0; - for(int y = 0; y < pool_size; ++y) + for(int y = 0; y < pool_size_y; ++y) { int x = 0; - for(; x <= (pool_size - 8); x += 8) + for(; x <= (pool_size_x - 8); x += 8) { const uint8x8_t data = vld1_u8(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); @@ -1681,7 +2009,7 @@ void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Wi } // Leftover for loop - for(; x < pool_size; ++x) + for(; x < pool_size_x; ++x) { const uint8_t data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); res = std::max(res, data); @@ -1713,20 +2041,23 @@ Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInf BorderSize border_size(0); const bool is_global_pooling = pool_info.is_global_pooling(); - const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width; + const unsigned int pool_size_x = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width; + const unsigned int pool_size_y = is_global_pooling ? 
input->tensor_shape().y() : pool_info.pool_size().height; - // Validate pool info befor calling scaled_dimensions - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size)); + // Validate pool info before calling scaled_dimensions + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y)); // Check output dimensions std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), input->dimension(1), - pool_size, - pool_size, + pool_size_x, + pool_size_y, pool_info.pad_stride_info()); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size, pool_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size_x)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, + pool_size_x, pool_size_y) + .first); return Status{}; } -- cgit v1.2.1
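Reviewer notes with hedged sketches follow. The snippets are written for this review, not taken from the library: helper names such as avg_scale, max_pool_window and pooled_dims_floor are hypothetical, and the NEON snippet assumes an ARM target with <arm_neon.h> available.

First, the averaging scale. calculate_avg_scale() now takes pool_size_x and pool_size_y separately so rectangular windows divide by the correct element count. A minimal standalone restatement of the patched logic, with the kernel's exclude_padding template parameter turned into a plain runtime flag for readability:

#include <algorithm>

// The window is clipped against the (possibly padded) upper bounds; when
// padding is excluded, the start coordinates are also clamped to the valid
// region so the divisor counts only real input elements.
float avg_scale(int out_x, int out_y,
                int pool_size_x, int pool_size_y,
                int upper_bound_w, int upper_bound_h,
                int pad_x, int pad_y,
                int stride_x, int stride_y,
                bool exclude_padding)
{
    int start_x = out_x * stride_x - pad_x;
    int start_y = out_y * stride_y - pad_y;
    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}

With a 3x5 window fully inside the input this returns 1/15; at a padded corner with exclude_padding set, the divisor shrinks to the window's overlap with the image.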
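Second, the new poolingMxN_* kernels all share one shape: a vectorised main loop across the pool width, a scalar leftover loop for widths that are not a multiple of the vector length (this is what unlocks pool widths 4, 5 and 6), and a pairwise horizontal reduction of the vector accumulator. A sketch of one MAX output for F32 over a row-major window, which also shows why the patch replaces std::numeric_limits<float>::min() with lowest() in the MAX initialisation:

#include <arm_neon.h>
#include <algorithm>
#include <limits>

// Computes one MAX-pooling output over a pool_size_x x pool_size_y window.
// 'window' points at the top-left element; 'row_stride' is in floats.
float max_pool_window(const float *window, int row_stride,
                      int pool_size_x, int pool_size_y)
{
    // lowest(), not min(): min() is the smallest *positive* normal float
    // and would give wrong results for all-negative windows.
    float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
    float       res  = std::numeric_limits<float>::lowest();

    for(int y = 0; y < pool_size_y; ++y)
    {
        const float *row = window + y * row_stride;
        int          x   = 0;
        // Vector main loop: four elements per iteration
        for(; x <= pool_size_x - 4; x += 4)
        {
            vres = vmaxq_f32(vres, vld1q_f32(row + x));
        }
        // Scalar leftover loop for widths that are not a multiple of 4
        for(; x < pool_size_x; ++x)
        {
            res = std::max(res, row[x]);
        }
    }

    // Pairwise horizontal reduction of the vector accumulator
    const float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
    res = std::max(res, vget_lane_f32(tmp, 0));
    res = std::max(res, vget_lane_f32(tmp, 1));
    return res;
}

The qint8/qint16 MAX kernels above follow the same pattern with vmaxq_s8/vmaxq_s16 and a vpmax_s8/vpmax_s16 reduction.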
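Finally, the output shape: scaled_dimensions() is now fed the X and Y pool sizes independently. A sketch of that computation under the assumption of symmetric padding and FLOOR rounding; the library's version takes a PadStrideInfo and, as far as this review is concerned, may also apply CEIL rounding:

#include <utility>

// Pooled output dimensions for a pool_x x pool_y window; assumes the
// window always fits, i.e. w + 2*pad_x >= pool_x (and likewise for Y).
std::pair<unsigned int, unsigned int>
pooled_dims_floor(unsigned int w, unsigned int h,
                  unsigned int pool_x, unsigned int pool_y,
                  unsigned int pad_x, unsigned int pad_y,
                  unsigned int stride_x, unsigned int stride_y)
{
    const unsigned int pooled_w = (w + 2 * pad_x - pool_x) / stride_x + 1;
    const unsigned int pooled_h = (h + 2 * pad_y - pool_y) / stride_y + 1;
    return std::make_pair(pooled_w, pooled_h);
}

For example, a 7x5 input with a 4x2 pool, stride 2 and no padding yields a 2x2 output, which is exactly the kind of rectangular case COMPMID-828 enables.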