aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
diff options
context:
space:
mode:
authorIsabella Gottardi <isabella.gottardi@arm.com>2018-01-30 15:26:00 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:47:18 +0000
commit7567f5f1919f69ea00c2cd5bdca65b67dfe6b388 (patch)
tree990f34d88c00dfeb1c913eb24cc0ee7de4fdc4c3 /src/core/NEON/kernels/NEPoolingLayerKernel.cpp
parent3cfd237ec976d109989c530f5aa726d1e94963e9 (diff)
downloadComputeLibrary-7567f5f1919f69ea00c2cd5bdca65b67dfe6b388.tar.gz
COMPMID-828 - Add support for pool widths 4, 5 & 6 and for non square data sizes - Part 2 (NEON)
Change-Id: I64bc8e3f71236edb71494f431ee34077eb8814ca Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118203 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEPoolingLayerKernel.cpp')
-rw-r--r--src/core/NEON/kernels/NEPoolingLayerKernel.cpp975
1 files changed, 653 insertions, 322 deletions
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index a3ab8a361f..b6af51733a 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -60,13 +60,13 @@ void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int poole
}
template <bool exclude_padding>
-inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
+inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
int start_x = id.x() * stride_x - pad_x;
int start_y = id.y() * stride_y - pad_y;
- const int end_x = std::min(start_x + pool_size, upper_bound_w);
- const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
if(exclude_padding)
{
start_x = std::max(0, start_x);
@@ -151,26 +151,25 @@ inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offs
v = vsetq_lane_u16(elems[7], v, 7);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size_x)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- PoolingType pool_type = pool_info.pool_type();
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
- const bool exclude_padding = pool_info.exclude_padding();
- const bool is_global_pooling = pool_info.is_global_pooling();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ PoolingType pool_type = pool_info.pool_type();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+ const bool exclude_padding = pool_info.exclude_padding();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
static const std::set<int> supported_pool_sizes = { 2, 3 };
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()));
+
+ ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8))
+ && (pool_type != PoolingType::MAX));
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x != pool_size_y);
if(output->total_size() != 0)
{
@@ -182,19 +181,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size)
+Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
{
- const bool is_global_pooling = pool_info.is_global_pooling();
- ARM_COMPUTE_UNUSED(pool_size);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
- "Global pooling is supported only with rectangular inputs!");
+ ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
return Status{};
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
BorderSize &border_size,
- unsigned int pooled_w, unsigned int pooled_h, int pool_size)
+ unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
{
unsigned int num_elems_read_per_iteration = 0;
unsigned int num_elems_horizontal_window = 0;
@@ -204,122 +201,124 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const int input_height = input->dimension(1);
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_right = pad_stride_info.pad_right();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
- const int pool_pad_bottom = pad_stride_info.pad_bottom();
-
+ const int pool_pad_right = pad_stride_info.pad_right();
+ const int pool_pad_top = pad_stride_info.pad_top();
+ const int pool_pad_left = pad_stride_info.pad_left();
+ const int pool_pad_bottom = pad_stride_info.pad_bottom();
+ const bool is_square = pool_size_x == pool_size_y;
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
input->dimension(1),
- pool_size,
- pool_size,
+ pool_size_x,
+ pool_size_y,
pad_stride_info);
- // Select element size
- switch(input->data_type())
+ // If the pool is not square (or no optimized case matches below), the generic MxN path is used
+ num_elems_read_per_iteration = 1;
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+
+ if(is_square)
{
- case DataType::QS8:
- num_elems_read_per_iteration = 16;
- switch(pool_size)
- {
- case 2:
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
- break;
- case 3:
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
- break;
- default:
- ARM_COMPUTE_ERROR("Pooling size not supported");
- break;
- }
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- break;
- case DataType::QASYMM8:
- switch(pool_size)
- {
- case 2:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- break;
- case 3:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- break;
- default:
- num_elems_read_per_iteration = 1;
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
- break;
- }
- break;
- case DataType::QS16:
- num_elems_read_per_iteration = 8;
- switch(pool_size)
- {
- case 2:
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
- break;
- case 3:
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
- break;
- default:
- ARM_COMPUTE_ERROR("Pooling size not supported");
- }
- num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
- break;
+ switch(input->data_type())
+ {
+ case DataType::QS8:
+ num_elems_read_per_iteration = 16;
+ switch(pool_size_x)
+ {
+ case 2:
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
+ break;
+ case 3:
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
+ break;
+ default:
+ break;
+ }
+ break;
+ case DataType::QASYMM8:
+ switch(pool_size_x)
+ {
+ case 2:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ case DataType::QS16:
+ num_elems_read_per_iteration = 8;
+ switch(pool_size_x)
+ {
+ case 2:
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
+ break;
+ case 3:
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
+ break;
+ default:
+ break;
+ }
+ break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- switch(pool_size)
- {
- case 2:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = 8;
- num_elems_horizontal_window = 8;
- break;
- case 3:
- num_elems_read_per_iteration = 4;
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
- break;
- default:
- ARM_COMPUTE_ERROR("Pooling size not supported");
- break;
- }
- break;
+ case DataType::F16:
+ switch(pool_size_x)
+ {
+ case 2:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = 8;
+ num_elems_horizontal_window = 8;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 4;
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ default:
+ break;
+ }
+ break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- switch(pool_size)
- {
- case 2:
- num_elems_read_per_iteration = 2;
- break;
- case 3:
- num_elems_read_per_iteration = 4; // We use vload4 for pooling3
- break;
- case 7:
- num_elems_read_per_iteration = 8; // We use vload8 for pooling7
- break;
- default:
- num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop
- break;
- }
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
+ case DataType::F32:
+ switch(pool_size_x)
+ {
+ case 2:
+ num_elems_read_per_iteration = 2;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 4; // We use vload4 for pooling3
+ break;
+ case 7:
+ num_elems_read_per_iteration = 8; // We use vload8 for pooling7
+ break;
+ default:
+ break;
+ }
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
}
-
// Number of iterations in X dimension
const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
// Upper limit for the number of right/bottom border elements that are accessed
const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size) - input_height;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
border_size.right = std::max(upper_bound_w, pool_pad_right);
@@ -351,7 +350,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
NEPoolingLayerKernel::NEPoolingLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
{
}
@@ -371,29 +370,31 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
const int pool_stride_x = pad_stride_info.stride().first;
// Update pool size in case of global pooling
- const int pool_size = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width;
+ const int pool_size_x = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width;
+ const int pool_size_y = is_global_pooling ? input->info()->dimension(1) : pool_info.pool_size().height;
// Validate pool info before calling scaled_dimensions
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y));
// Check output dimensions
unsigned int pooled_w, pooled_h;
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
input->info()->dimension(1),
- pool_size,
- pool_size,
+ pool_size_x,
+ pool_size_y,
pad_stride_info);
// Output auto initialization if not yet initialized
auto_init(input->info(), output->info(), pooled_w, pooled_h);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size, pool_size));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size_x));
// Set instance variables
_input = input;
_output = output;
_pool_info = pool_info;
+ _is_square = (pool_size_x == pool_size_y);
// Get data type
const DataType data_type = input->info()->data_type();
@@ -401,41 +402,63 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
// Select appropriate function
if(data_type == DataType::QS8)
{
- switch(pool_size)
+ if(_is_square)
{
- case 2:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- case 3:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling size!");
+ switch(pool_size_x)
+ {
+ case 2:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ case 3:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ default:
+ switch(pool_type)
+ {
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ }
+ }
+ else
+ {
+ switch(pool_type)
+ {
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
}
else if(data_type == DataType::QASYMM8)
{
- if(pool_size == 2 && pool_stride_x < 3)
+ if(pool_size_x == 2 && pool_stride_x < 3 && _is_square)
{
switch(pool_type)
{
@@ -449,7 +472,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
ARM_COMPUTE_ERROR("Unsupported pooling type!");
}
}
- else if(pool_size == 3 && pool_stride_x < 3)
+ else if(pool_size_x == 3 && pool_stride_x < 3 && _is_square)
{
switch(pool_type)
{
@@ -468,10 +491,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
switch(pool_type)
{
case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, false>;
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, false>;
break;
case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::MAX>;
+ _func = &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::MAX>;
break;
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -480,151 +503,227 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
}
else if(data_type == DataType::QS16)
{
- switch(pool_size)
+ if(_is_square)
{
- case 2:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- case 3:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling size!");
+ switch(pool_size_x)
+ {
+ case 2:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ case 3:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ default:
+ switch(pool_type)
+ {
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ }
+ }
+ else
+ {
+ switch(pool_type)
+ {
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
}
else if(data_type == DataType::F16)
{
- switch(pool_size)
+ if(_is_square)
{
- case 2:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- case 3:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling size!");
+ switch(pool_size_x)
+ {
+ case 2:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ case 3:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ default:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ }
+ }
+ else
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
}
else if(data_type == DataType::F32)
{
- switch(pool_size)
+ if(_is_square)
{
- case 2:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- case 3:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- case 7:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- default:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
+ switch(pool_size_x)
+ {
+ case 2:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ case 3:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ case 7:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ default:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ }
+ }
+ else
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
}
}
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size);
+ auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size_x, pool_size_y);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
@@ -913,7 +1012,7 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float16x4_t scale_v = vdup_n_f16(scale);
// Perform pooling
const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
@@ -979,7 +1078,7 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window
if(pooling_type != PoolingType::MAX)
{
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float16x8_t scale_v = vdupq_n_f16(scale);
res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
}
@@ -1042,7 +1141,7 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
@@ -1375,7 +1474,7 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
@@ -1433,7 +1532,7 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ float scale = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
@@ -1486,13 +1585,241 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window
input, output);
}
+/** Max-pool an MxN window of QS8 (8-bit fixed point) input into one output element.
+ *
+ * Only PoolingType::MAX is implemented for QS8 data (see the marker below);
+ * the template parameter mirrors the signatures of the other pooling kernels.
+ *
+ * @param[in] window_input Input-side window (steps over the input plane).
+ * @param[in] window       Output window to iterate over.
+ */
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::poolingMxN_q8(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size_x   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_y   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
+    const int pool_pad_top  = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+    int       pool_stride_x = 0;
+    int       pool_stride_y = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Seed both accumulators with the smallest representable QS8 value:
+        // zero-initialisation ({}) would wrongly clamp all-negative pooling
+        // regions to 0, since QS8 data is signed.
+        qint8x16_t vres = vdupq_n_s8(std::numeric_limits<qint8_t>::lowest());
+        qint8_t    res  = std::numeric_limits<qint8_t>::lowest();
+
+        //PoolingType::MAX
+        for(int y = 0; y < pool_size_y; ++y)
+        {
+            int x = 0;
+            // Vector path: 16 elements per iteration
+            for(; x <= (pool_size_x - 16); x += 16)
+            {
+                const qint8x16_t data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
+                                                                                    (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                vres                  = vmaxq_s8(vres, data);
+            }
+
+            // Leftover for loop
+            for(; x < pool_size_x; ++x)
+            {
+                qint8_t data = *(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                res          = std::max(res, data);
+            }
+        }
+        // Reduce: fold the 16 vector lanes into the scalar maximum
+        const qint8x8_t half_vres = vpmax_s8(vget_low_s8(vres), vget_high_s8(vres));
+        res                       = std::max(res, vget_lane_s8(half_vres, 0));
+        res                       = std::max(res, vget_lane_s8(half_vres, 1));
+        res                       = std::max(res, vget_lane_s8(half_vres, 2));
+        res                       = std::max(res, vget_lane_s8(half_vres, 3));
+        res                       = std::max(res, vget_lane_s8(half_vres, 4));
+        res                       = std::max(res, vget_lane_s8(half_vres, 5));
+        res                       = std::max(res, vget_lane_s8(half_vres, 6));
+        res                       = std::max(res, vget_lane_s8(half_vres, 7));
+
+        // Store result
+        *(reinterpret_cast<qint8_t *>(output.ptr())) = res;
+    },
+    input, output);
+}
+
+/** Max-pool an MxN window of QS16 (16-bit fixed point) input into one output element.
+ *
+ * Only PoolingType::MAX is implemented for QS16 data (see the marker below);
+ * the template parameter mirrors the signatures of the other pooling kernels.
+ *
+ * @param[in] window_input Input-side window (steps over the input plane).
+ * @param[in] window       Output window to iterate over.
+ */
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::poolingMxN_q16(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size_x   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_y   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
+    const int pool_pad_top  = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+    int       pool_stride_x = 0;
+    int       pool_stride_y = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Seed both accumulators with the smallest representable QS16 value:
+        // zero-initialisation ({}) would wrongly clamp all-negative pooling
+        // regions to 0, since QS16 data is signed.
+        qint16x8_t vres = vdupq_n_s16(std::numeric_limits<qint16_t>::lowest());
+        qint16_t   res  = std::numeric_limits<qint16_t>::lowest();
+
+        //PoolingType::MAX
+        for(int y = 0; y < pool_size_y; ++y)
+        {
+            int x = 0;
+            // Vector path: 8 elements per iteration
+            for(; x <= (pool_size_x - 8); x += 8)
+            {
+                const qint16x8_t data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
+                                                                                      (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                vres                  = vmaxq_s16(vres, data);
+            }
+
+            // Leftover for loop
+            for(; x < pool_size_x; ++x)
+            {
+                qint16_t data = *(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                res           = std::max(res, data);
+            }
+        }
+        // Reduce: fold the 8 vector lanes into the scalar maximum
+        const qint16x4_t half_vres = vpmax_s16(vget_low_s16(vres), vget_high_s16(vres));
+        res                        = std::max(res, vget_lane_s16(half_vres, 0));
+        res                        = std::max(res, vget_lane_s16(half_vres, 1));
+        res                        = std::max(res, vget_lane_s16(half_vres, 2));
+        res                        = std::max(res, vget_lane_s16(half_vres, 3));
+
+        // Store result
+        *(reinterpret_cast<qint16_t *>(output.ptr())) = res;
+    },
+    input, output);
+}
+
+/** Generic MxN pooling (AVG / L2 / MAX) over an FP16 input plane.
+ *
+ * Compiled only when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is available;
+ * otherwise the function raises a runtime error.
+ *
+ * @param[in] window_input Input-side window (steps over the input plane).
+ * @param[in] window       Output window to iterate over.
+ */
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Window &window)
{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    Iterator input(_input, window_input);
    Iterator output(_output, window);
-    const int pool_size     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_x     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_y     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
+    const int pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int       pool_stride_x   = 0;
+    int       pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Accumulators for the AVG/L2 path; the MAX path below re-seeds its
+        // own accumulators with the lowest representable value.
+        float16_t   res  = 0.0f;
+        float16x8_t vres = vdupq_n_f16(0.0f);
+
+        if(pooling_type != PoolingType::MAX)
+        {
+            // Calculate scale
+            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+            // Perform pooling
+
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                int x = 0;
+                for(; x <= (pool_size_x - 8); x += 8)
+                {
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
+                                                                                           (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+
+                    // Get power of 2 in case of l2 pooling and accumulate
+                    if(pooling_type == PoolingType::L2)
+                    {
+                        vres = vaddq_f16(vres, vmulq_f16(data, data));
+                    }
+                    else
+                    {
+                        vres = vaddq_f16(vres, data);
+                    }
+                }
+
+                // Leftover for loop
+                for(; x < pool_size_x; ++x)
+                {
+                    float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+
+                    // Get power of 2 in case of l2 pooling
+                    if(pooling_type == PoolingType::L2)
+                    {
+                        data *= data;
+                    }
+
+                    res += data;
+                }
+            }
+
+            // Reduction
+            float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
+            res += vget_lane_f16(tmp, 0);
+            res += vget_lane_f16(tmp, 1);
+            res += vget_lane_f16(tmp, 2);
+            res += vget_lane_f16(tmp, 3);
+
+            // Divide by scale
+            res *= scale;
+        }
+        else
+        {
+            // NOTE(review): this inner 'vres' intentionally shadows the outer
+            // accumulator. std::numeric_limits<float>::lowest() is narrowed
+            // to float16 here (presumably saturating to -inf — TODO confirm),
+            // which is still a valid identity element for max.
+            float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
+            res              = std::numeric_limits<float>::lowest();
+
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                int x = 0;
+                for(; x <= (pool_size_x - 8); x += 8)
+                {
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
+                                                                                           (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                    vres = vmaxq_f16(vres, data);
+                }
+
+                // Leftover for loop
+                for(; x < pool_size_x; ++x)
+                {
+                    const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
+                    res                  = std::max(res, data);
+                }
+            }
+
+            // Reduce the 8 lanes to the scalar maximum
+            float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
+            res = std::max(res, vget_lane_f16(tmp, 0));
+            res = std::max(res, vget_lane_f16(tmp, 1));
+            res = std::max(res, vget_lane_f16(tmp, 2));
+            res = std::max(res, vget_lane_f16(tmp, 3));
+        }
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            res = std::sqrt(res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float16_t *>(output.ptr())) = res;
+    },
+    input, output);
+
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+}
+
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+ const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
@@ -1510,15 +1837,15 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window
if(pooling_type != PoolingType::MAX)
{
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
float32x4_t vres = vdupq_n_f32(0.0f);
- for(int y = 0; y < pool_size; ++y)
+ for(int y = 0; y < pool_size_y; ++y)
{
int x = 0;
- for(; x <= (pool_size - 4); x += 4)
+ for(; x <= (pool_size_x - 4); x += 4)
{
const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
(y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
@@ -1535,7 +1862,7 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window
}
// Leftover for loop
- for(; x < pool_size; ++x)
+ for(; x < pool_size_x; ++x)
{
float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
@@ -1564,13 +1891,13 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window
}
else
{
- float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::min());
- res = std::numeric_limits<float>::min();
+ float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
+ res = std::numeric_limits<float>::lowest();
- for(int y = 0; y < pool_size; ++y)
+ for(int y = 0; y < pool_size_y; ++y)
{
int x = 0;
- for(; x <= (pool_size - 4); x += 4)
+ for(; x <= (pool_size_x - 4); x += 4)
{
const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
(y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
@@ -1578,7 +1905,7 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window
}
// Leftover for loop
- for(; x < pool_size; ++x)
+ for(; x < pool_size_x; ++x)
{
const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
res = std::max(res, data);
@@ -1609,12 +1936,13 @@ void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window
}
template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const Window &window)
{
Iterator input(_input, window_input);
Iterator output(_output, window);
- const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+ const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+ const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
@@ -1635,13 +1963,13 @@ void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Wi
uint32_t sres = 0;
// Calculate scale
- const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
// Perform pooling
- for(int y = 0; y < pool_size; ++y)
+ for(int y = 0; y < pool_size_y; ++y)
{
int x = 0;
- for(; x <= (pool_size - 8); x += 8)
+ for(; x <= (pool_size_x - 8); x += 8)
{
const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
(y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
@@ -1651,7 +1979,7 @@ void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Wi
}
// Leftover for loop
- for(; x < pool_size; ++x)
+ for(; x < pool_size_x; ++x)
{
uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
sres += data;
@@ -1670,10 +1998,10 @@ void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Wi
uint8x8_t vres = vdup_n_u8(0);
res = 0;
- for(int y = 0; y < pool_size; ++y)
+ for(int y = 0; y < pool_size_y; ++y)
{
int x = 0;
- for(; x <= (pool_size - 8); x += 8)
+ for(; x <= (pool_size_x - 8); x += 8)
{
const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
(y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
@@ -1681,7 +2009,7 @@ void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Wi
}
// Leftover for loop
- for(; x < pool_size; ++x)
+ for(; x < pool_size_x; ++x)
{
const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
res = std::max(res, data);
@@ -1713,20 +2041,23 @@ Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInf
BorderSize border_size(0);
const bool is_global_pooling = pool_info.is_global_pooling();
- const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
+ const unsigned int pool_size_x = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
+ const unsigned int pool_size_y = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height;
- // Validate pool info befor calling scaled_dimensions
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size));
+ // Validate pool info before calling scaled_dimensions
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
input->dimension(1),
- pool_size,
- pool_size,
+ pool_size_x,
+ pool_size_y,
pool_info.pad_stride_info());
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size, pool_size));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size_x));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
+ pool_size_x, pool_size_y)
+ .first);
return Status{};
}