From 57dac8400d56a4b68975d5563a9540c96d49fe5f Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 1 Mar 2018 16:03:50 +0000 Subject: COMPMID-806 Add NHWC data format support format for NEON pooling Change-Id: I7ab174c72f3d56134fcec259a137739061fd12e9 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/123065 Reviewed-by: Georgios Pinitas Tested-by: Jenkins --- .../core/NEON/kernels/NEPoolingLayerKernel.h | 53 +- .../runtime/NEON/functions/NEPoolingLayer.h | 1 + src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 885 +++++++++++++++++---- src/runtime/NEON/functions/NEPoolingLayer.cpp | 47 +- tests/benchmark/CL/PoolingLayer.cpp | 73 +- tests/benchmark/GLES_COMPUTE/PoolingLayer.cpp | 73 +- tests/benchmark/NEON/PoolingLayer.cpp | 33 +- tests/benchmark/fixtures/PoolingLayerFixture.h | 6 +- tests/datasets/PoolingLayerDataset.h | 3 +- tests/validation/CL/GlobalPoolingLayer.cpp | 11 +- tests/validation/CL/PoolingLayer.cpp | 57 +- .../validation/GLES_COMPUTE/GlobalPoolingLayer.cpp | 11 +- tests/validation/GLES_COMPUTE/PoolingLayer.cpp | 21 +- tests/validation/Helpers.cpp | 3 +- tests/validation/NEON/GlobalPoolingLayer.cpp | 6 +- tests/validation/NEON/PoolingLayer.cpp | 60 +- tests/validation/fixtures/PoolingLayerFixture.h | 37 +- tests/validation/reference/PoolingLayer.cpp | 35 +- 18 files changed, 1083 insertions(+), 332 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h index 8250342b53..4140ccf1ed 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -82,112 +82,133 @@ private: * @param[in] window Output region on which to execute the kernel. */ template - void pooling2_f32(const Window &window_input, const Window &window); + void pooling2_f32_nchw(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for float16_t. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling2_f16(const Window &window_input, const Window &window); + void pooling2_f16_nchw(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for 8bit fixed point. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling2_q8(const Window &window_input, const Window &window); + void pooling2_q8_nchw(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for 8bit asymmetric fixed point. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling2_qasymm8(const Window &window_input, const Window &window); + void pooling2_qasymm8_nchw(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for 16bit fixed point. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling2_q16(const Window &window_input, const Window &window); + void pooling2_q16_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. 
*/ template - void pooling3_f32(const Window &window_input, const Window &window); + void pooling3_f32_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling3_f16(const Window &window_input, const Window &window); + void pooling3_f16_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling for 8bit fixed point. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling3_q8(const Window &window_input, const Window &window); + void pooling3_q8_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling for 8bit quantized fixed point. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling3_qasymm8(const Window &window_input, const Window &window); + void pooling3_qasymm8_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling for 16bit fixed point. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling3_q16(const Window &window_input, const Window &window); + void pooling3_q16_nchw(const Window &window_input, const Window &window); /** Function to perform 7x7 pooling. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void pooling7_f32(const Window &window_input, const Window &window); + void pooling7_f32_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 8bit fixed point. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void poolingMxN_q8(const Window &window_input, const Window &window); + void poolingMxN_q8_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 8-bit quantized. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void poolingMxN_qasymm8(const Window &window_input, const Window &window); + void poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window); + /** Function to perform MxN pooling for 8-bit quantized. (NHWC) + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + template + void poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 16bit fixed point. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void poolingMxN_q16(const Window &window_input, const Window &window); + void poolingMxN_q16_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 16-bit floating point values. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. 
*/ template - void poolingMxN_f16(const Window &window_input, const Window &window); + void poolingMxN_f16_nchw(const Window &window_input, const Window &window); + /** Function to perform MxN pooling for 16-bit floating point values. (NHWC) + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + template + void poolingMxN_f16_nhwc(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 32-bit floating point values. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ template - void poolingMxN_f32(const Window &window_input, const Window &window); + void poolingMxN_f32_nchw(const Window &window_input, const Window &window); + /** Function to perform MxN pooling for 32-bit floating point values (NHWC). + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + template + void poolingMxN_f32_nhwc(const Window &window_input, const Window &window); /** Common signature for all the specialised Pooling functions * * @param[in] window_input Input region on which to execute the kernel. diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index 3ac0844798..4224f75c77 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -72,6 +72,7 @@ private: NEPoolingLayerKernel _pooling_layer_kernel; NEFillBorderKernel _border_handler; bool _is_global_pooling_layer; + DataLayout _data_layout; }; } #endif /* __ARM_COMPUTE_NEPOOLINGLAYER_H__ */ diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index b6af51733a..ffb6d08993 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -53,20 +53,24 @@ namespace void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h) { TensorShape output_shape{ input->tensor_shape() }; - output_shape.set(0, pooled_w); - output_shape.set(1, pooled_h); + output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH), pooled_w); + output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT), pooled_h); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); } -template +template inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { - int start_x = id.x() * stride_x - pad_x; - int start_y = id.y() * stride_y - pad_y; - const int end_x = std::min(start_x + pool_size_x, upper_bound_w); - const int end_y = std::min(start_y + pool_size_y, upper_bound_h); + const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + int start_x = id[idx_width] * stride_x - pad_x; + int start_y = id[idx_height] * stride_y - pad_y; + + const int end_x = std::min(start_x + pool_size_x, upper_bound_w); + const int end_y = std::min(start_y + pool_size_y, upper_bound_h); if(exclude_padding) 
{ start_x = std::max(0, start_x); @@ -175,7 +179,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w) + || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h)); } return Status{}; @@ -193,12 +199,16 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen BorderSize &border_size, unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y) { + // Get data layout + DataLayout data_layout = input->data_layout(); unsigned int num_elems_read_per_iteration = 0; unsigned int num_elems_horizontal_window = 0; int pool_stride_x = 0; int pool_stride_y = 0; - const int input_width = input->dimension(0); - const int input_height = input->dimension(1); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int input_width = input->dimension(idx_width); + const int input_height = input->dimension(idx_height); const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); const int pool_pad_right = pad_stride_info.pad_right(); @@ -206,18 +216,22 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const int pool_pad_left = pad_stride_info.pad_left(); const int pool_pad_bottom = pad_stride_info.pad_bottom(); const bool is_square = pool_size_x == pool_size_y; + // Check output dimensions - std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), - input->dimension(1), + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width), + input->dimension(idx_height), pool_size_x, pool_size_y, pad_stride_info); + auto_init(input, output, pooled_w, pooled_h); //If it's not squared and optimized will be executed the MxN num_elems_read_per_iteration = 1; num_elems_processed_per_iteration = 1; num_elems_horizontal_window = 1; + const bool is_nhwc = data_layout == DataLayout::NHWC; + if(is_square) { switch(input->data_type()) @@ -239,6 +253,11 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } break; case DataType::QASYMM8: + if(is_nhwc) + { + num_elems_processed_per_iteration = 8; + break; + } switch(pool_size_x) { case 2: @@ -273,6 +292,11 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: + if(is_nhwc) + { + num_elems_processed_per_iteration = 8; + break; + } switch(pool_size_x) { case 2: @@ -291,6 +315,11 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: + if(is_nhwc) + { + num_elems_processed_per_iteration = 4; + break; + } switch(pool_size_x) { case 2: @@ -313,35 +342,61 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen break; } } - // Number of iterations in X dimension - const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / 
num_elems_processed_per_iteration; + else + { + if(is_nhwc) + { + if(DataType::QASYMM8 == input->data_type()) + { + num_elems_processed_per_iteration = 8; + } + else + { + num_elems_processed_per_iteration = 4; + } + } + } - // Upper limit for the number of right/bottom border elements that are accessed - const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height; + bool window_changed = false; + Window win{}; + if(data_layout == DataLayout::NCHW) + { + // Number of iterations in X dimension + const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; - border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); - border_size.right = std::max(upper_bound_w, pool_pad_right); - border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - bool window_changed = false; + // Upper limit for the number of right/bottom border elements that are accessed + const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height; - TensorShape output_shape{ input->tensor_shape() }; - output_shape.set(0, pooled_w); - output_shape.set(1, pooled_h); - TensorInfo output_info(input->clone()->set_tensor_shape(output_shape)); + border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); + border_size.right = std::max(upper_bound_w, pool_pad_right); + border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - Window win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom); + TensorShape output_shape{ input->tensor_shape() }; + output_shape.set(0, pooled_w); + output_shape.set(1, pooled_h); + TensorInfo output_info(input->clone()->set_tensor_shape(output_shape)); + + win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration)); + AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom); - if(output->total_size() != 0) - { AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window); window_changed = update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); } else { - window_changed = update_window_and_padding(win, input_access); + TensorShape output_shape{ input->tensor_shape() }; + output_shape.set(1, pooled_w); + output_shape.set(2, pooled_h); + TensorInfo output_info(input->clone()->set_tensor_shape(output_shape)); + + win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); } Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -368,18 +423,25 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons const bool exclude_padding = pool_info.exclude_padding(); const bool is_global_pooling = pool_info.is_global_pooling(); const int pool_stride_x = pad_stride_info.stride().first; + unsigned int pool_size_x = 0; + unsigned int pool_size_y = 0; + + // Get data layout + const DataLayout data_layout = input->info()->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Update pool size in case of global pooling - const int pool_size_x = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width; - const int pool_size_y = is_global_pooling ? input->info()->dimension(1) : pool_info.pool_size().height; + pool_size_x = is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width; + pool_size_y = is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height; // Validate pool info before calling scaled_dimensions ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y)); // Check output dimensions unsigned int pooled_w, pooled_h; - std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), - input->info()->dimension(1), + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width), + input->info()->dimension(idx_height), pool_size_x, pool_size_y, pad_stride_info); @@ -398,6 +460,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons // Get data type const DataType data_type = input->info()->data_type(); + const bool is_nchw = data_layout == DataLayout::NCHW; // Select appropriate function if(data_type == DataType::QS8) @@ -410,10 +473,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling2_q8; + _func = &NEPoolingLayerKernel::pooling2_q8_nchw; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_q8; + _func = &NEPoolingLayerKernel::pooling2_q8_nchw; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -423,10 +486,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling3_q8; + _func = &NEPoolingLayerKernel::pooling3_q8_nchw; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_q8; + _func = &NEPoolingLayerKernel::pooling3_q8_nchw; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -436,7 +499,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_q8; + _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -449,7 +512,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_q8; + _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -463,10 +526,24 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons 
switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8 : &NEPoolingLayerKernel::pooling2_qasymm8; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8_nchw : &NEPoolingLayerKernel::pooling2_qasymm8_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_qasymm8; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -477,10 +554,24 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8 : &NEPoolingLayerKernel::pooling3_qasymm8; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8_nchw : &NEPoolingLayerKernel::pooling3_qasymm8_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_qasymm8; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -491,10 +582,24 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8 : &NEPoolingLayerKernel::poolingMxN_qasymm8; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw : &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw; + } + else + { + _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_qasymm8; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -511,10 +616,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling2_q16; + _func = &NEPoolingLayerKernel::pooling2_q16_nchw; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_q16; + _func = &NEPoolingLayerKernel::pooling2_q16_nchw; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -524,10 +629,10 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling3_q16; + _func = &NEPoolingLayerKernel::pooling3_q16_nchw; break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_q16; + _func = &NEPoolingLayerKernel::pooling3_q16_nchw; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -537,7 +642,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_q16; + _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -550,7 +655,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_q16; + _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw; break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -567,13 +672,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16 : &NEPoolingLayerKernel::pooling2_f16; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw : &NEPoolingLayerKernel::pooling2_f16_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16 : &NEPoolingLayerKernel::pooling2_f16; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw : &NEPoolingLayerKernel::pooling2_f16_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_f16; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::pooling2_f16_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -583,13 +709,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw : &NEPoolingLayerKernel::pooling3_f16_nchw; + } + else + { + _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16 : &NEPoolingLayerKernel::pooling3_f16; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw : &NEPoolingLayerKernel::pooling3_f16_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_f16; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::pooling3_f16_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -599,13 +746,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16 : &NEPoolingLayerKernel::poolingMxN_f16; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw : &NEPoolingLayerKernel::poolingMxN_f16_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16 : &NEPoolingLayerKernel::poolingMxN_f16; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw : &NEPoolingLayerKernel::poolingMxN_f16_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_f16; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -618,13 +786,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16 : &NEPoolingLayerKernel::poolingMxN_f16; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw : &NEPoolingLayerKernel::poolingMxN_f16_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16 : &NEPoolingLayerKernel::poolingMxN_f16; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw : &NEPoolingLayerKernel::poolingMxN_f16_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_f16; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -641,13 +830,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + if(is_nchw) + { + _func = (exclude_padding) ? 
&NEPoolingLayerKernel::pooling2_f32_nchw : &NEPoolingLayerKernel::pooling2_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw : &NEPoolingLayerKernel::pooling2_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_f32; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::pooling2_f32_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -657,13 +867,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw : &NEPoolingLayerKernel::pooling3_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw : &NEPoolingLayerKernel::pooling3_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_f32; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::pooling3_f32_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -673,13 +904,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw : &NEPoolingLayerKernel::pooling7_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32 : &NEPoolingLayerKernel::pooling7_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw : &NEPoolingLayerKernel::pooling7_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling7_f32; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::pooling7_f32_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -689,13 +941,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_f32 : &NEPoolingLayerKernel::poolingMxN_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw : &NEPoolingLayerKernel::poolingMxN_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32 : &NEPoolingLayerKernel::poolingMxN_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw : &NEPoolingLayerKernel::poolingMxN_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_f32; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -708,13 +981,34 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons switch(pool_type) { case PoolingType::AVG: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32 : &NEPoolingLayerKernel::poolingMxN_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw : &NEPoolingLayerKernel::poolingMxN_f32_nchw; + } + else + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::L2: - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32 : &NEPoolingLayerKernel::poolingMxN_f32; + if(is_nchw) + { + _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw : &NEPoolingLayerKernel::poolingMxN_f32_nchw; + } + else + { + _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_f32; + if(is_nchw) + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } break; default: ARM_COMPUTE_ERROR("Unsupported pooling type!"); @@ -729,7 +1023,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons } template -void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -794,7 +1088,7 @@ void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window } template -void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -908,7 +1202,7 @@ void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Wi } template -void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling2_q16_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -973,7 +1267,7 @@ void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window } template -void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC Iterator input(_input, window_input); @@ -1012,7 +1306,7 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float16x4_t scale_v = vdup_n_f16(scale); // Perform pooling const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); @@ -1043,7 +1337,7 @@ void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window } template -void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC Iterator input(_input, window_input); @@ -1078,7 +1372,7 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { - const float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float16x8_t scale_v = vdupq_n_f16(scale); res = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1])))); } @@ 
-1105,7 +1399,7 @@ void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window } template -void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1141,7 +1435,7 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling @@ -1168,7 +1462,7 @@ void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window } template -void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1244,7 +1538,7 @@ void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window } template -void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1364,7 +1658,7 @@ void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Wi } template -void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling3_q16_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1435,7 +1729,7 @@ void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window } template -void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1474,7 +1768,7 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling @@ -1503,7 +1797,7 @@ void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window } template -void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1532,7 +1826,7 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window if(pooling_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, 
upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling @@ -1586,7 +1880,7 @@ void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window } template -void NEPoolingLayerKernel::poolingMxN_q8(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1640,7 +1934,7 @@ void NEPoolingLayerKernel::poolingMxN_q8(const Window &window_input, const Windo } template -void NEPoolingLayerKernel::poolingMxN_q16(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_q16_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1690,7 +1984,7 @@ void NEPoolingLayerKernel::poolingMxN_q16(const Window &window_input, const Wind } template -void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC Iterator input(_input, window_input); @@ -1716,7 +2010,7 @@ void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Wind if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling @@ -1813,7 +2107,116 @@ void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Wind } template -void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window) +{ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + Iterator input(_input, window_input); + Iterator output(_output, window); + + const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width; + const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height; + const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 
0 : pool_pad_bottom); + + float16x8_t vres; + + execute_window_loop(window, [&](const Coordinates & id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + if(pooling_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float16x8_t scale_v = vdupq_n_f16(scale); + + // Perform pooling + vres = vdupq_n_f16(0.0f); + + for(int y = 0; y < pool_size_y; ++y) + { + if(y + idx_height > window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start()) + { + continue; + } + + for(int x = 0; x < pool_size_x; ++x) + { + if(x + idx_width > window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start()) + { + continue; + } + + const float16x8_t data = vld1q_f16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().z())); + + // Get power of 2 in case of l2 pooling and accumulate + if(pooling_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } + } + } + // Divide by scale + vres = vmulq_f16(vres, scale_v); + } + else + { + vres = vdupq_n_f16(std::numeric_limits::lowest()); + for(int y = 0; y < pool_size_y; ++y) + { + if(y + idx_height > window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start()) + { + continue; + } + + for(int x = 0; x < pool_size_x; ++x) + { + if(x + idx_width > window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start()) + { + continue; + } + + const float16x8_t data = vld1q_f16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().z())); + vres = vmaxq_f16(vres, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); + vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); + } + + // Store result + vst1q_f16(reinterpret_cast(output.ptr()), vres); + }, + input, output); + +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + ARM_COMPUTE_UNUSED(window_input); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_ERROR("FP16 Not supported! 
Recompile the library with arch=arm64-v8.2-a"); +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +} + +template +void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1837,7 +2240,7 @@ void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Wind if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling float32x4_t vres = vdupq_n_f32(0.0f); @@ -1936,7 +2339,109 @@ void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Wind } template -void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width; + const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height; + const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 
0 : pool_pad_bottom); + + float32x4_t vres; + + execute_window_loop(window, [&](const Coordinates & id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + if(pooling_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float32x4_t scale_v = vdupq_n_f32(scale); + + // Perform pooling + vres = vdupq_n_f32(0.0f); + + for(int y = 0; y < pool_size_y; ++y) + { + if(y + idx_height > window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start()) + { + continue; + } + + for(int x = 0; x < pool_size_x; ++x) + { + if(x + idx_width > window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start()) + { + continue; + } + + const float32x4_t data = vld1q_f32(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().z())); + + // Get power of 2 in case of l2 pooling and accumulate + if(pooling_type == PoolingType::L2) + { + vres = vmlaq_f32(vres, data, data); + } + else + { + vres = vaddq_f32(vres, data); + } + } + } + // Divide by scale + vres = vmulq_f32(vres, scale_v); + } + else + { + vres = vdupq_n_f32(std::numeric_limits::lowest()); + for(int y = 0; y < pool_size_y; ++y) + { + if(y + idx_height > window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start()) + { + continue; + } + + for(int x = 0; x < pool_size_x; ++x) + { + if(x + idx_width > window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start()) + { + continue; + } + + const float32x4_t data = vld1q_f32(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().z())); + vres = vmaxq_f32(vres, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + float32x4_t sqrt_reciprocal = vrsqrteq_f32(vres); + vres = vmulq_f32(vres, vmulq_f32(vrsqrtsq_f32(vmulq_f32(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); + } + + // Store result + vst1q_f32(reinterpret_cast(output.ptr()), vres); + }, + input, output); +} + +template +void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1963,7 +2468,7 @@ void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const uint32_t sres = 0; // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling for(int y = 0; y < pool_size_y; ++y) @@ -2031,6 +2536,101 @@ void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const input, output); } +template +void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width; + const int pool_size_y = _pool_info.is_global_pooling() ? 
_input->info()->tensor_shape().z() : _pool_info.pool_size().height; + const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + if(pooling_type != PoolingType::MAX) + { + uint32x4_t vres1 = vdupq_n_u32(0); + uint32x4_t vres2 = vdupq_n_u32(0); + + // Calculate scale + const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float32x4_t scale_v = vdupq_n_f32(scale); + + // Perform pooling + for(int y = 0; y < pool_size_y; ++y) + { + if(y + idx_height > window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start()) + { + continue; + } + + for(int x = 0; x < pool_size_x; ++x) + { + if(x + idx_width > window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start()) + { + continue; + } + + const uint8x8_t data = vld1_u8(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().z())); + + const uint16x8_t data_u16 = vmovl_u8(data); + vres1 = vaddq_u32(vres1, vmovl_u16(vget_low_u16(data_u16))); + vres2 = vaddq_u32(vres2, vmovl_u16(vget_high_u16(data_u16))); + } + } + // Divide by scale + vres1 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres1), scale_v)); + vres2 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres2), scale_v)); + + uint8x8_t res = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2))); + + // Store result + vst1_u8(output.ptr(), res); + } + else + { + uint8x8_t vres = vdup_n_u8(0); + + for(int y = 0; y < pool_size_y; ++y) + { + if(y + idx_height > window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start()) + { + continue; + } + + for(int x = 0; x < pool_size_x; ++x) + { + if(x + idx_width > window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start()) + { + continue; + } + + const uint8x8_t data = vld1_u8(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() + + (y - pool_pad_top) * _input->info()->strides_in_bytes().z())); + vres = vmax_u8(vres, data); + } + } + + // Store result + vst1_u8(output.ptr(), vres); + } + }, + input, output); +} + Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); @@ -2040,16 +2640,24 @@ Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInf unsigned int num_elems_processed_per_iteration = 0; BorderSize border_size(0); - const bool is_global_pooling = pool_info.is_global_pooling(); - const unsigned int pool_size_x = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width; - const unsigned int pool_size_y = is_global_pooling ? 
input->tensor_shape().y() : pool_info.pool_size().height; + const bool is_global_pooling = pool_info.is_global_pooling(); + unsigned int pool_size_x = 0; + unsigned int pool_size_y = 0; + + // Get data layout + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size().width; + pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size().height; // Validate pool info before calling scaled_dimensions ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y)); // Check output dimensions - std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0), - input->dimension(1), + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width), + input->dimension(idx_height), pool_size_x, pool_size_y, pool_info.pad_stride_info()); @@ -2073,39 +2681,48 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second; const unsigned int pool_size = _pool_info.pool_size().width; - // Set step for input in x and y direction for the input - Window window_input(window); - unsigned int window_x_inc = 0; - switch(_input->info()->data_type()) + Window window_input(window); + if(_input->info()->data_layout() == DataLayout::NCHW) { - case DataType::QS8: - case DataType::QS16: - case DataType::F16: - { - window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; - break; - } - case DataType::QASYMM8: + // Set step for input in x and y direction for the input + unsigned int window_x_inc = 0; + switch(_input->info()->data_type()) { - window_x_inc = pool_stride_x; - if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) + case DataType::QS8: + case DataType::QS16: + case DataType::F16: { window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + break; + } + case DataType::QASYMM8: + { + window_x_inc = pool_stride_x; + if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) + { + window_x_inc = (pool_stride_x == 2) ? 
_num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + } + break; + } + case DataType::F32: + { + window_x_inc = pool_stride_x; + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); } - break; - } - case DataType::F32: - { - window_x_inc = pool_stride_x; - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); } + window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); + window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); + } + else + { + window_input.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration)); + window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x)); + window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y)); } - window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); - window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); // Run function (this->*_func)(window_input, window); diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index bc0b6f86d3..cbfd68485f 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -31,7 +31,7 @@ using namespace arm_compute; NEPoolingLayer::NEPoolingLayer() - : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false) + : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW) { } @@ -40,17 +40,31 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay // Check if we have Global Pooling Layer _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size().width) && (input->info()->dimension(1) == pool_info.pool_size().height); + // Get data layout + _data_layout = input->info()->data_layout(); + // Configure pooling kernel _pooling_layer_kernel.configure(input, output, pool_info); - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding()) + switch(_data_layout) { - zero_value = PixelValue(static_cast(input->info()->quantization_info().offset)); + case DataLayout::NCHW: + { + // Configure border depending on operation required (quantize border in case of asymmetric data_type) + BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? 
BorderMode::REPLICATE : BorderMode::CONSTANT; + PixelValue zero_value(0.f); + if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding()) + { + zero_value = PixelValue(static_cast(input->info()->quantization_info().offset)); + } + _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value); + break; + } + case DataLayout::NHWC: + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); } - _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value); } Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info) @@ -60,9 +74,20 @@ Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *out void NEPoolingLayer::run() { - // Fill border - NEScheduler::get().schedule(&_border_handler, Window::DimY); + switch(_data_layout) + { + case DataLayout::NCHW: + // Fill border + NEScheduler::get().schedule(&_border_handler, Window::DimY); - // Run pooling layer - NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY); + // Run pooling layer + NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY); + break; + case DataLayout::NHWC: + // Run pooling layer + NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX); + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } } \ No newline at end of file diff --git a/tests/benchmark/CL/PoolingLayer.cpp b/tests/benchmark/CL/PoolingLayer.cpp index 9c567202de..f9fefaf141 100644 --- a/tests/benchmark/CL/PoolingLayer.cpp +++ b/tests/benchmark/CL/PoolingLayer.cpp @@ -47,7 +47,8 @@ namespace benchmark { namespace { -const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 }); +const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 }); +const auto data_layouts = framework::dataset::make("DataLayout", { DataLayout::NCHW }); } // namespace using CLPoolingLayerFixture = PoolingLayerFixture; @@ -55,74 +56,88 @@ using CLPoolingLayerFixture = PoolingLayerFixture; @@ -55,74 +56,88 @@ using GCPoolingLayerFixture = PoolingLayerFixture - void setup(TensorShape src_shape, TensorShape dst_shape, PoolingLayerInfo info, DataType data_type, int batches) + void setup(TensorShape src_shape, TensorShape dst_shape, PoolingLayerInfo info, DataType data_type, DataLayout data_layout, int batches) { // Set batched in source and destination shapes const unsigned int fixed_point_position = 4; @@ -50,8 +50,8 @@ public: dst_shape.set(dst_shape.num_dimensions(), batches); // Create tensors - src = create_tensor(src_shape, data_type, 1, fixed_point_position); - dst = create_tensor(dst_shape, data_type, 1, fixed_point_position); + src = create_tensor(src_shape, data_type, 1, fixed_point_position, QuantizationInfo(), data_layout); + dst = create_tensor(dst_shape, data_type, 1, fixed_point_position, QuantizationInfo(), data_layout); // Create and configure function pool_layer.configure(&src, &dst, info); diff --git a/tests/datasets/PoolingLayerDataset.h b/tests/datasets/PoolingLayerDataset.h index 53e392fe69..36818010a5 100644 --- a/tests/datasets/PoolingLayerDataset.h +++ b/tests/datasets/PoolingLayerDataset.h @@ -55,7 +55,7 @@ public: std::stringstream description; description << "In=" << *_src_it << ":"; description << "Out=" << *_dst_it << ":"; - description << "Info=" << *_infos_it; + description << 
"Info=" << *_infos_it << ":"; return description.str(); } @@ -116,6 +116,7 @@ public: add_config(TensorShape(60U, 52U, 3U, 2U), TensorShape(13U, 11U, 32U), PoolingLayerInfo(PoolingType::AVG, Size2D(100, 100), PadStrideInfo(5, 5, 50, 50), true)); // Asymmetric padding add_config(TensorShape(112U, 112U, 32U), TensorShape(56U, 56U, 32U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))); + add_config(TensorShape(14U, 14U, 832U), TensorShape(7U, 7U, 832U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))); } }; } // namespace datasets diff --git a/tests/validation/CL/GlobalPoolingLayer.cpp b/tests/validation/CL/GlobalPoolingLayer.cpp index 31e3fe0eb7..46752c4913 100644 --- a/tests/validation/CL/GlobalPoolingLayer.cpp +++ b/tests/validation/CL/GlobalPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -59,7 +59,9 @@ using CLGlobalPoolingLayerFixture = GlobalPoolingLayerValidationFixture, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunGlobalPooling, CLGlobalPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", + DataType::F32)), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); @@ -67,8 +69,9 @@ FIXTURE_DATA_TEST_CASE(RunGlobalPooling, CLGlobalPoolingLayerFixture, fra TEST_SUITE_END() TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunGlobalPooling, CLGlobalPoolingLayerFixture, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", - DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunGlobalPooling, CLGlobalPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", + DataType::F16)), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp index 9da4c55c78..79e526b9ec 100644 --- a/tests/validation/CL/PoolingLayer.cpp +++ b/tests/validation/CL/PoolingLayer.cpp @@ -57,7 +57,7 @@ const auto PoolingLayerDatasetQS = combine(combine(combine(framework::dataset::m /** Input data set for asymmetric data type */ const auto PoolingLayerDatasetQASYMM8 = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(5, 7), Size2D(8, 9) })), framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })), - framework::dataset::make("ExcludePadding", { true, false })); + framework::dataset::make("ExcludePadding", { true })); constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */ constexpr AbsoluteTolerance tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ @@ -126,35 +126,40 @@ FIXTURE_DATA_TEST_CASE(RunSpecial, CLSpecialPoolingLayerFixture, framewor // Validate output 
validate(CLAccessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", - DataType::F32)))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", + DataType::F32))), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", - DataType::F32)))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", + DataType::F32))), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } -TEST_SUITE_END() +TEST_SUITE_END() // FP32 TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, - framework::dataset::make("DataType", DataType::F16)))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", DataType::F16))), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, - framework::dataset::make("DataType", DataType::F16)))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", DataType::F16))), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // FP16 +TEST_SUITE_END() // Float template using CLPoolingLayerFixedPointFixture = PoolingLayerValidationFixedPointFixture; @@ -175,7 +180,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixedPointFixture, framew // Validate output validate(CLAccessor(_target), _reference, tolerance_qs8); } -TEST_SUITE_END() +TEST_SUITE_END() // QS8 TEST_SUITE(QS16) FIXTURE_DATA_TEST_CASE(RunTiny, CLPoolingLayerFixedPointFixture, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), combine(PoolingLayerDatasetQS, @@ -192,8 +197,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixedPointFixture, frame // Validate output validate(CLAccessor(_target), _reference, tolerance_qs16); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // QS16 +TEST_SUITE_END() // fixedPoint TEST_SUITE(Quantized) @@ -201,27 +206,29 @@ template using CLPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture; TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), 
combine(PoolingLayerDatasetQASYMM8, +FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8, framework::dataset::make("DataType", DataType::QASYMM8))), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), - QuantizationInfo(7.f / 255, 123) - }))) + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), + QuantizationInfo(7.f / 255, 123) + })), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8, +FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8, framework::dataset::make("DataType", DataType::QASYMM8))), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) }))) + framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // Quantized -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // PoolingLayer +TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp b/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp index 88372ffe24..162f189af5 100644 --- a/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp +++ b/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -59,7 +59,9 @@ using GCGlobalPoolingLayerFixture = GlobalPoolingLayerValidationFixture, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunGlobalPooling, GCGlobalPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", + DataType::F32)), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f32); @@ -67,8 +69,9 @@ FIXTURE_DATA_TEST_CASE(RunGlobalPooling, GCGlobalPoolingLayerFixture, fra TEST_SUITE_END() TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunGlobalPooling, GCGlobalPoolingLayerFixture, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", - DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunGlobalPooling, GCGlobalPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", + DataType::F16)), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f16); diff --git a/tests/validation/GLES_COMPUTE/PoolingLayer.cpp b/tests/validation/GLES_COMPUTE/PoolingLayer.cpp index 1496ceec1c..ac1bd724ac 100644 --- a/tests/validation/GLES_COMPUTE/PoolingLayer.cpp +++ b/tests/validation/GLES_COMPUTE/PoolingLayer.cpp @@ -87,14 +87,17 @@ using GCPoolingLayerFixture = PoolingLayerValidationFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", - DataType::F32)))) +FIXTURE_DATA_TEST_CASE(RunSmall, GCPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", + DataType::F32))), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", - DataType::F32)))) +FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", + DataType::F32))), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f32); @@ -102,14 +105,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture, framework::Datase TEST_SUITE_END() TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, GCPoolingLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, - framework::dataset::make("DataType", DataType::F16)))) +FIXTURE_DATA_TEST_CASE(RunSmall, GCPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", DataType::F16))), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, - framework::dataset::make("DataType", DataType::F16)))) 
+FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", DataType::F16))), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f16); diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp index 3d554f0d25..6e7ec8ddc1 100644 --- a/tests/validation/Helpers.cpp +++ b/tests/validation/Helpers.cpp @@ -119,7 +119,8 @@ HarrisCornersParameters harris_corners_parameters() SimpleTensor convert_from_asymmetric(const SimpleTensor &src) { const QuantizationInfo &quantization_info = src.quantization_info(); - SimpleTensor dst{ src.shape(), DataType::F32, 1, 0 }; + SimpleTensor dst{ src.shape(), DataType::F32, 1, 0, QuantizationInfo(), src.data_layout() }; + for(int i = 0; i < src.num_elements(); ++i) { dst[i] = quantization_info.dequantize(src[i]); diff --git a/tests/validation/NEON/GlobalPoolingLayer.cpp b/tests/validation/NEON/GlobalPoolingLayer.cpp index 37950b059f..769780647b 100644 --- a/tests/validation/NEON/GlobalPoolingLayer.cpp +++ b/tests/validation/NEON/GlobalPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -58,7 +58,9 @@ using NEGlobalPoolingLayerFixture = GlobalPoolingLayerValidationFixture, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunGlobalPooling, NEGlobalPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", + DataType::F32)), + framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp index 350a7b883b..b44f945eb7 100644 --- a/tests/validation/NEON/PoolingLayer.cpp +++ b/tests/validation/NEON/PoolingLayer.cpp @@ -59,7 +59,7 @@ const auto PoolingLayerDatasetQS = combine(combine(combine(framework::dataset::m const auto PoolingLayerDatasetQASYMM8 = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(4, 4), Size2D(9, 9), Size2D(3, 7), Size2D(7, 8) })), framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })), - framework::dataset::make("ExcludePadding", { true, false })); + framework::dataset::make("ExcludePadding", { true })); constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for float types */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -130,37 +130,42 @@ FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecialPoolingLayerFixture, framewor // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", - DataType::F32)))) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", + 
DataType::F32))), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", - DataType::F32)))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", + DataType::F32))), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -TEST_SUITE_END() +TEST_SUITE_END() // FP32 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, - framework::dataset::make("DataType", DataType::F16)))) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", DataType::F16))), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, - framework::dataset::make("DataType", DataType::F16)))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, + framework::dataset::make("DataType", DataType::F16))), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -TEST_SUITE_END() -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -TEST_SUITE_END() +TEST_SUITE_END() // FP16 +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +TEST_SUITE_END() // Float template using NEPoolingLayerFixedPointFixture = PoolingLayerValidationFixedPointFixture; @@ -181,7 +186,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixedPointFixture, framew // Validate output validate(Accessor(_target), _reference, tolerance_qs8); } -TEST_SUITE_END() +TEST_SUITE_END() // QS8 TEST_SUITE(QS16) FIXTURE_DATA_TEST_CASE(RunTiny, NEPoolingLayerFixedPointFixture, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), combine(PoolingLayerDatasetQS, @@ -198,8 +203,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixedPointFixture, frame // Validate output validate(Accessor(_target), _reference, tolerance_qs16); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // QS16 +TEST_SUITE_END() // FixedPoint TEST_SUITE(Quantized) @@ -207,27 +212,28 @@ template using NEPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture; TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8, +FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8, framework::dataset::make("DataType", 
DataType::QASYMM8))), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), - QuantizationInfo(7.f / 255, 123) - }))) + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127), + QuantizationInfo(7.f / 255, 123) + })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8, +FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8, framework::dataset::make("DataType", DataType::QASYMM8))), - framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) }))) + framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -TEST_SUITE_END() -TEST_SUITE_END() - -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // Quantized +TEST_SUITE_END() // PoolingLayer +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/fixtures/PoolingLayerFixture.h b/tests/validation/fixtures/PoolingLayerFixture.h index 3bbb403ae7..3c28b3b64d 100644 --- a/tests/validation/fixtures/PoolingLayerFixture.h +++ b/tests/validation/fixtures/PoolingLayerFixture.h @@ -47,14 +47,20 @@ class PoolingLayerValidationGenericFixture : public framework::Fixture { public: template - void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, int fractional_bits, QuantizationInfo quantization_info) + void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, DataLayout data_layout, int fractional_bits, QuantizationInfo quantization_info) { _fractional_bits = fractional_bits; _quantization_info = quantization_info; _pool_info = pool_info; - _target = compute_target(shape, pool_info, data_type, fractional_bits, quantization_info); - _reference = compute_reference(shape, pool_info, data_type, fractional_bits, quantization_info); + // Change shape in case of NHWC. 
+ if(data_layout == DataLayout::NHWC) + { + permute(shape, PermutationVector(2U, 0U, 1U)); + } + + _target = compute_target(shape, pool_info, data_type, data_layout, fractional_bits, quantization_info); + _reference = compute_reference(shape, pool_info, data_type, data_layout, fractional_bits, quantization_info); } protected: @@ -79,10 +85,10 @@ protected: } TensorType compute_target(const TensorShape &shape, PoolingLayerInfo info, - DataType data_type, int fixed_point_position, QuantizationInfo quantization_info) + DataType data_type, DataLayout data_layout, int fixed_point_position, QuantizationInfo quantization_info) { // Create tensors - TensorType src = create_tensor(shape, data_type, 1, fixed_point_position, quantization_info); + TensorType src = create_tensor(shape, data_type, 1, fixed_point_position, quantization_info, data_layout); TensorType dst; // Create and configure function @@ -109,10 +115,10 @@ protected: } SimpleTensor compute_reference(const TensorShape &shape, PoolingLayerInfo info, - DataType data_type, int fixed_point_position, QuantizationInfo quantization_info) + DataType data_type, DataLayout data_layout, int fixed_point_position, QuantizationInfo quantization_info) { // Create reference - SimpleTensor src{ shape, data_type, 1, fixed_point_position, quantization_info }; + SimpleTensor src{ shape, data_type, 1, fixed_point_position, quantization_info, data_layout }; // Fill reference fill(src); @@ -132,10 +138,10 @@ class PoolingLayerValidationFixture : public PoolingLayerValidationGenericFixtur { public: template - void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type) + void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout) { PoolingLayerValidationGenericFixture::setup(shape, PoolingLayerInfo(pool_type, pool_size, pad_stride_info, exclude_padding), - data_type, 0, QuantizationInfo()); + data_type, data_layout, 0, QuantizationInfo()); } }; @@ -147,7 +153,7 @@ public: void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, int fractional_bits) { PoolingLayerValidationGenericFixture::setup(shape, PoolingLayerInfo(pool_type, pool_size, pad_stride_info, exclude_padding), - data_type, fractional_bits, QuantizationInfo()); + data_type, DataLayout::NCHW, fractional_bits, QuantizationInfo()); } }; @@ -156,10 +162,11 @@ class PoolingLayerValidationQuantizedFixture : public PoolingLayerValidationGene { public: template - void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, QuantizationInfo quantization_info) + void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, + QuantizationInfo quantization_info, DataLayout data_layout = DataLayout::NCHW) { PoolingLayerValidationGenericFixture::setup(shape, PoolingLayerInfo(pool_type, pool_size, pad_stride_info, exclude_padding), - data_type, 0, quantization_info); + data_type, data_layout, 0, quantization_info); } }; @@ -171,7 +178,7 @@ public: void setup(TensorShape src_shape, TensorShape dst_shape, PoolingLayerInfo pool_info, DataType data_type) { ARM_COMPUTE_UNUSED(dst_shape); - PoolingLayerValidationGenericFixture::setup(src_shape, pool_info, data_type, 0, 
QuantizationInfo()); + PoolingLayerValidationGenericFixture::setup(src_shape, pool_info, data_type, DataLayout::NCHW, 0, QuantizationInfo()); } }; @@ -180,9 +187,9 @@ class GlobalPoolingLayerValidationFixture : public PoolingLayerValidationGeneric { public: template - void setup(TensorShape shape, PoolingType pool_type, DataType data_type) + void setup(TensorShape shape, PoolingType pool_type, DataType data_type, DataLayout data_layout = DataLayout::NCHW) { - PoolingLayerValidationGenericFixture::setup(shape, PoolingLayerInfo(pool_type), data_type, 0, QuantizationInfo()); + PoolingLayerValidationGenericFixture::setup(shape, PoolingLayerInfo(pool_type), data_type, DataLayout::NCHW, 0, QuantizationInfo()); } }; diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp index c14ab98c28..071c20ed56 100644 --- a/tests/validation/reference/PoolingLayer.cpp +++ b/tests/validation/reference/PoolingLayer.cpp @@ -23,6 +23,7 @@ */ #include "PoolingLayer.h" +#include "Permute.h" #include "arm_compute/core/Types.h" #include "tests/validation/FixedPoint.h" #include "tests/validation/Helpers.h" @@ -54,8 +55,8 @@ TensorShape calculate_output_shape(TensorShape shape, const PoolingLayerInfo &in } } // namespace -template ::value, int>::type> -SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo &info) +template +SimpleTensor pooling_layer_nchw(const SimpleTensor &src, SimpleTensor &dst, const PoolingLayerInfo &info) { ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y())); @@ -74,9 +75,6 @@ SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo const auto h_src = static_cast(src.shape()[1]); const int upper_dims = src.shape().total_size() / (w_src * h_src); - // Create reference - SimpleTensor dst{ calculate_output_shape(src.shape(), info), src.data_type(), 1, src.fixed_point_position() }; - const auto w_dst = static_cast(dst.shape()[0]); const auto h_dst = static_cast(dst.shape()[1]); @@ -173,6 +171,10 @@ SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo { ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y())); + const auto w_src = static_cast(src.shape()[0]); + const auto h_src = static_cast(src.shape()[1]); + const int upper_dims = src.shape().total_size() / (w_src * h_src); + const int pool_size_x = info.is_global_pooling() ? src.shape().x() : info.pool_size().width; const int pool_size_y = info.is_global_pooling() ? 
src.shape().y() : info.pool_size().height; PoolingType type = info.pool_type(); @@ -184,10 +186,6 @@ SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo int pad_bottom = info.pad_stride_info().pad_bottom(); bool exclude_padding = info.exclude_padding(); - const auto w_src = static_cast(src.shape()[0]); - const auto h_src = static_cast(src.shape()[1]); - const int upper_dims = src.shape().total_size() / (w_src * h_src); - // Create reference SimpleTensor dst{ calculate_output_shape(src.shape(), info), src.data_type(), 1, src.fixed_point_position() }; @@ -299,6 +297,25 @@ SimpleTensor pooling_layer(const SimpleTensor &src, c return dst; } +template ::value, int>::type> +SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo &info) +{ + if(src.data_layout() == DataLayout::NHWC) + { + SimpleTensor src_nchw = reference::permute(src, PermutationVector(1U, 2U, 0U)); + SimpleTensor dst{ calculate_output_shape(src_nchw.shape(), info), src_nchw.data_type(), 1, src_nchw.fixed_point_position() }; + + pooling_layer_nchw(src_nchw, dst, info); + + return reference::permute(dst, PermutationVector(2U, 0U, 1U)); + } + else + { + SimpleTensor dst{ calculate_output_shape(src.shape(), info), src.data_type(), 1, src.fixed_point_position() }; + return pooling_layer_nchw(src, dst, info); + } +} + template SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo &info); template SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo &info); template SimpleTensor pooling_layer(const SimpleTensor &src, const PoolingLayerInfo &info); -- cgit v1.2.1
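
A few notes on the NHWC support added above, each with a small stand-alone C++ sketch; every helper name below is illustrative and is not Compute Library API.

Layout and vectorisation: the NHWC kernels (e.g. poolingMxN_qasymm8_nhwc) keep the SIMD lanes on the channel axis and walk the pooling window through the tensor's y/z strides, which is why run() sets the NHWC input window to step over dimension 0 by _num_elems_processed_per_iteration and over dimensions 1 and 2 by the pooling strides. A minimal sketch of the underlying address arithmetic, assuming dense strides and a single batch:

#include <cstddef>

// Byte offset of element (c, x, y) under the two layouts; elem is the element size in bytes.
static size_t offset_nchw(size_t c, size_t x, size_t y, size_t W, size_t H, size_t elem)
{
    return elem * (x + W * (y + H * c)); // neighbouring pixels of one channel are contiguous
}

static size_t offset_nhwc(size_t c, size_t x, size_t y, size_t C, size_t W, size_t elem)
{
    return elem * (c + C * (x + W * y)); // all channels of one pixel are contiguous
}

Because offset_nhwc is contiguous in c, a single vld1q_f32 or vld1_u8 fetches the same spatial position for 4 (or 8) channels at once; positions that fall outside the input window are simply skipped by the continue checks in the kernel, which is why the NHWC branch of NEPoolingLayer::configure() does not set up a border handler.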
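
Averaging scale: average and L2 pooling divide the accumulated sum by a scale obtained from calculate_avg_scale, where upper_bound_w/h include the right/bottom padding only when exclude_padding is false. The exact clamping lives outside the hunks shown here, so the following is a plausible scalar equivalent rather than the library routine:

#include <algorithm>

static float avg_scale(int out_x, int out_y, int pool_w, int pool_h,
                       int upper_bound_w, int upper_bound_h,
                       int pad_left, int pad_top,
                       int stride_x, int stride_y, bool exclude_padding)
{
    int start_x = out_x * stride_x - pad_left;
    int start_y = out_y * stride_y - pad_top;
    const int end_x = std::min(start_x + pool_w, upper_bound_w);
    const int end_y = std::min(start_y + pool_h, upper_bound_h);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}

With exclude_padding the divisor is the number of real input elements under the window; without it, the divisor is the full padded window area clipped to the padded input extent.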
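
L2 pooling square root: the FP32 paths finish L2 pooling with a reciprocal square root estimate (vrsqrteq_f32) refined by one Newton-Raphson step (vrsqrtsq_f32), then multiply back by the accumulated sum, since sqrt(x) = x * rsqrt(x). A scalar rendering of that sequence, with std::sqrt standing in for the hardware estimate:

#include <cmath>

// vrsqrts(a, b) evaluates (3 - a * b) / 2; applied with a = x * r and b = r it
// performs one Newton-Raphson refinement of an approximate rsqrt r of x.
static float refine_rsqrt(float x, float r)
{
    return r * (3.0f - x * r * r) * 0.5f;
}

static float sqrt_via_rsqrt(float x)
{
    float r = 1.0f / std::sqrt(x); // stand-in for the vrsqrteq_f32 estimate
    r = refine_rsqrt(x, r);        // the vrsqrtsq_f32 plus vmulq_f32 pair in the kernel
    return x * r;                  // final vmulq_f32 with the accumulated sum
}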
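
QASYMM8 averaging: on the quantized NHWC path the eight channels held in a uint8x8_t are widened before accumulation (vmovl_u8 to 16 bit, then vmovl_u16 into two 32-bit halves) so that even a global pooling window cannot overflow, converted to float for the scale multiply, and narrowed back with vmovn. A scalar stand-in for the same dataflow, assuming eight channels and a window of count valid elements:

#include <cstdint>

static void average_eight_u8_channels(const uint8_t *win, int count, float scale, uint8_t *out)
{
    uint32_t acc[8] = {};                // mirrors the two uint32x4_t accumulators
    for(int i = 0; i < count; ++i)
    {
        for(int c = 0; c < 8; ++c)
        {
            acc[c] += win[i * 8 + c];    // widening add: vmovl_u8 / vmovl_u16 / vaddq_u32
        }
    }
    for(int c = 0; c < 8; ++c)
    {
        // vcvtq_f32_u32, vmulq_f32 by the scale, vcvtq_u32_f32 (truncating), then vmovn
        out[c] = static_cast<uint8_t>(static_cast<uint32_t>(acc[c] * scale));
    }
}

Max pooling on the same path stays entirely in uint8 (vmax_u8), so it needs no widening.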
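
Reference strategy: the validation reference avoids a second NHWC implementation. pooling_layer() permutes an NHWC input to NCHW, reuses pooling_layer_nchw(), and permutes the result back, and the fixture likewise permutes the dataset shapes when the NHWC layout is requested. A generic 3-D shape permutation, shown only to illustrate the round trip; it uses the convention dst[i] = src[perm[i]], which may not match the library's PermutationVector convention:

#include <array>
#include <cstddef>

static std::array<size_t, 3> permute3(const std::array<size_t, 3> &src, const std::array<size_t, 3> &perm)
{
    return { src[perm[0]], src[perm[1]], src[perm[2]] };
}

// Round trip: an NHWC shape stored as {C, W, H} maps to NCHW {W, H, C} with
// perm {1, 2, 0}; the pooled NCHW result maps back to NHWC with perm {2, 0, 1}.

Keeping a single NCHW reference means both layouts are validated against identical expected outputs, so the same tolerances apply to each.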