diff options
author | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2021-06-29 10:08:46 +0100 |
---|---|---|
committer | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2021-06-29 14:09:52 +0000 |
commit | 72b56875b9bb30a9ed1d2ad38ec51fc88e435c35 (patch) | |
tree | 0ce37505da5cfd8a4b75000432d66e75a2b48a32 | |
parent | 93b75e0c072c3cc5654fcdf6aed1068b40012081 (diff) | |
download | ComputeLibrary-72b56875b9bb30a9ed1d2ad38ec51fc88e435c35.tar.gz |
Enable global pooling optimization on OpenCL
- Add loop unrolling on X and use POOL_X and POOL_Y defines for the for
loop
Resolves COMPMID-4573
Change-Id: I33cb825cfb55912ccb0ab9d03bd33a3dab4c8b44
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5872
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- | src/core/CL/cl_kernels/pooling_layer.cl | 28 | ||||
-rw-r--r-- | tests/validation/Helpers.cpp | 24 | ||||
-rw-r--r-- | tests/validation/Helpers.h | 10 |
3 files changed, 53 insertions, 9 deletions
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index 8944c9b1ac..d63a2e51e8 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -711,8 +711,8 @@ __kernel void pooling_layer_MxN_nhwc( int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT; int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT; #else //DST_BATCH_SIZE != 1 - int idx_out_h = GET_SPATIAL_IDX(2, 1, 0); - int idx_out_n = 0; + int idx_out_h = GET_SPATIAL_IDX(2, 1, 0); + int idx_out_n = 0; #endif // DST_BATCH_SIZE != 1 __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; @@ -726,10 +726,10 @@ __kernel void pooling_layer_MxN_nhwc( int idx_in_w = idx_out_w * STRIDE_X - PAD_X; int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; - int pool_x_s = max((int)0, -idx_in_w); - int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); - int pool_y_s = max((int)0, -idx_in_h); - int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); #if defined(EXCLUDE_PADDING) int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); @@ -737,17 +737,27 @@ __kernel void pooling_layer_MxN_nhwc( int filter_size = POOL_SIZE_X * POOL_SIZE_Y; #endif // defined(EXCLUDE_PADDING) +#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 + // Global pooling path + for(int y = 0; y < POOL_SIZE_Y; ++y) + { +#pragma unroll 8 + for(int x = 0; x < POOL_SIZE_X; ++x) + { +#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 for(int y = pool_y_s; y < pool_y_e; ++y) { +#pragma unroll 8 for(int x = pool_x_s; x < pool_x_e; ++x) { +#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) data0; #if defined(FP_MIXED_PRECISION) // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); #else // defined(FP_MIXED_PRECISION) - data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); #endif // defined(FP_MIXED_PRECISION) #if defined(POOL_L2) @@ -757,9 +767,9 @@ __kernel void pooling_layer_MxN_nhwc( res0 = POOL_OP(res0, data0); } } - + #if defined(POOL_AVG) || defined(POOL_L2) - res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; #endif // defined(POOL_AVG) || defined(POOL_L2) #if defined(POOL_L2) diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp index 0f5d5c5101..237a5a517c 100644 --- a/tests/validation/Helpers.cpp +++ b/tests/validation/Helpers.cpp @@ -349,6 +349,30 @@ void add_padding_x(std::initializer_list<ITensor *> tensors, const DataLayout &d } } +void add_padding_y(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout) +{ + if(data_layout == DataLayout::NHWC) + { + constexpr unsigned int lower = 1U; + constexpr unsigned int upper = 4U; + + std::uniform_int_distribution<unsigned int> distribution(lower, upper); + size_t seed_offset = 0; + + for(ITensor *tensor : tensors) + { + ARM_COMPUTE_ERROR_ON(!tensor->info()->is_resizable()); + + std::mt19937 gen(library->seed() + seed_offset++); + + const unsigned int top = distribution(gen); + const unsigned int bottom = distribution(gen); + + tensor->info()->extend_padding(PaddingSize(top, 0U, bottom, 0U)); + } + } +} + template void get_tile(const SimpleTensor<float> &in, SimpleTensor<float> &roi, const Coordinates &coord); template void get_tile(const SimpleTensor<half> &in, SimpleTensor<half> &roi, const Coordinates &coord); template void get_tile(const SimpleTensor<int> &in, SimpleTensor<int> &roi, const Coordinates &coord); diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h index 00e588e7b7..a8804ad7e7 100644 --- a/tests/validation/Helpers.h +++ b/tests/validation/Helpers.h @@ -240,6 +240,16 @@ std::pair<int, int> get_symm_quantized_per_channel_bounds(const QuantizationInfo * @note This function adds padding to the input tensors only if data_layout == DataLayout::NHWC */ void add_padding_x(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout = DataLayout::NHWC, bool only_right_pad = false); + +/** Add random padding along the Y axis (between 1 and 4 rows per side) to all the input tensors. + * This is used in our validation suite in order to simulate implicit padding addition after configuring, but before allocating. + * + * @param[in] tensors List of tensors to add padding to + * @param[in] data_layout (Optional) Data layout of the operator + * + * @note This function adds padding to the input tensors only if data_layout == DataLayout::NHWC + */ +void add_padding_y(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout = DataLayout::NHWC); } // namespace validation } // namespace test } // namespace arm_compute |