about summary refs log tree commit diff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2021-06-29 10:08:46 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2021-06-29 14:09:52 +0000
commit72b56875b9bb30a9ed1d2ad38ec51fc88e435c35 (patch)
tree0ce37505da5cfd8a4b75000432d66e75a2b48a32
parent93b75e0c072c3cc5654fcdf6aed1068b40012081 (diff)
downloadComputeLibrary-72b56875b9bb30a9ed1d2ad38ec51fc88e435c35.tar.gz
Enable global pooling optimization on OpenCL
- Add loop unrolling on X and use POOL_X and POOL_Y defines for the for loop Resolves COMPMID-4573 Change-Id: I33cb825cfb55912ccb0ab9d03bd33a3dab4c8b44 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5872 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl28
-rw-r--r--tests/validation/Helpers.cpp24
-rw-r--r--tests/validation/Helpers.h10
3 files changed, 53 insertions, 9 deletions
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 8944c9b1ac..d63a2e51e8 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -711,8 +711,8 @@ __kernel void pooling_layer_MxN_nhwc(
int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
#else //DST_BATCH_SIZE != 1
- int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
- int idx_out_n = 0;
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
+ int idx_out_n = 0;
#endif // DST_BATCH_SIZE != 1
__global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
@@ -726,10 +726,10 @@ __kernel void pooling_layer_MxN_nhwc(
int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
#if defined(EXCLUDE_PADDING)
int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
@@ -737,17 +737,27 @@ __kernel void pooling_layer_MxN_nhwc(
int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
#endif // defined(EXCLUDE_PADDING)
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ // Global pooling path
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
for(int y = pool_y_s; y < pool_y_e; ++y)
{
+#pragma unroll 8
for(int x = pool_x_s; x < pool_x_e; ++x)
{
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
data0;
#if defined(FP_MIXED_PRECISION)
// In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
#else // defined(FP_MIXED_PRECISION)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
#endif // defined(FP_MIXED_PRECISION)
#if defined(POOL_L2)
@@ -757,9 +767,9 @@ __kernel void pooling_layer_MxN_nhwc(
res0 = POOL_OP(res0, data0);
}
}
-
+
#if defined(POOL_AVG) || defined(POOL_L2)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
#endif // defined(POOL_AVG) || defined(POOL_L2)
#if defined(POOL_L2)
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index 0f5d5c5101..237a5a517c 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -349,6 +349,30 @@ void add_padding_x(std::initializer_list<ITensor *> tensors, const DataLayout &d
}
}
+void add_padding_y(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout)
+{
+ if(data_layout == DataLayout::NHWC)
+ {
+ constexpr unsigned int lower = 1U;
+ constexpr unsigned int upper = 4U;
+
+ std::uniform_int_distribution<unsigned int> distribution(lower, upper);
+ size_t seed_offset = 0;
+
+ for(ITensor *tensor : tensors)
+ {
+ ARM_COMPUTE_ERROR_ON(!tensor->info()->is_resizable());
+
+ std::mt19937 gen(library->seed() + seed_offset++);
+
+ const unsigned int top = distribution(gen);
+ const unsigned int bottom = distribution(gen);
+
+ tensor->info()->extend_padding(PaddingSize(top, 0U, bottom, 0U));
+ }
+ }
+}
+
template void get_tile(const SimpleTensor<float> &in, SimpleTensor<float> &roi, const Coordinates &coord);
template void get_tile(const SimpleTensor<half> &in, SimpleTensor<half> &roi, const Coordinates &coord);
template void get_tile(const SimpleTensor<int> &in, SimpleTensor<int> &roi, const Coordinates &coord);
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index 00e588e7b7..a8804ad7e7 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -240,6 +240,16 @@ std::pair<int, int> get_symm_quantized_per_channel_bounds(const QuantizationInfo
* @note This function adds padding to the input tensors only if data_layout == DataLayout::NHWC
*/
void add_padding_x(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout = DataLayout::NHWC, bool only_right_pad = false);
+
+/** Add random padding along the Y axis (between 1 and 4 rows per side) to all the input tensors.
+ * This is used in our validation suite in order to simulate implicit padding addition after configuring, but before allocating.
+ *
+ * @param[in] tensors List of tensors to add padding to
+ * @param[in] data_layout (Optional) Data layout of the operator
+ *
+ * @note This function adds padding to the input tensors only if data_layout == DataLayout::NHWC
+ */
+void add_padding_y(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout = DataLayout::NHWC);
} // namespace validation
} // namespace test
} // namespace arm_compute