diff options
author | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2017-08-02 13:19:48 +0100 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:35:24 +0000 |
commit | cb29283e0d65297f4756e202df07eac1107841e6 (patch) | |
tree | 22592fe8e4132110fd5f9f0df53afb3dc0ba26c9 | |
parent | 484e7b3724c0e77751b5bed05180271fd5376e5d (diff) | |
download | ComputeLibrary-cb29283e0d65297f4756e202df07eac1107841e6.tar.gz |
COMPMID-477 - Optimizing Pooling 3x3 with stride_x <= 3 on OpenCL
Change-Id: Ie000166307cdb5bfae00ebf84d35e49a6bfb9dbd
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/83372
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
-rw-r--r-- | arm_compute/core/CL/kernels/CLPoolingLayerKernel.h | 3 | ||||
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 1 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/pooling_layer.cl | 255 | ||||
-rw-r--r-- | src/core/CL/kernels/CLPoolingLayerKernel.cpp | 76 |
4 files changed, 242 insertions, 93 deletions
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h index 6c5091ff9e..971e1506af 100644 --- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h @@ -49,7 +49,7 @@ public: /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: F16, F32. + * @param[in] input Source tensor. Data types supported: F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * Supported pooling sizes : 2, 3 and 7 @@ -65,6 +65,7 @@ private: ICLTensor *_output; PoolingLayerInfo _pool_info; BorderSize _border_size; + unsigned int _num_elems_processed_per_iteration; }; } #endif /*__ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H__ */ diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index cda2c5afe1..000cffa9ee 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -232,6 +232,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "pixelwise_mul_int", "pixelwise_mul_int.cl" }, { "pooling_layer_2", "pooling_layer.cl" }, { "pooling_layer_3", "pooling_layer.cl" }, + { "pooling_layer_3_optimized", "pooling_layer.cl" }, { "pooling_layer_7", "pooling_layer.cl" }, { "remap_nearest_neighbour", "remap.cl" }, { "remap_bilinear", "remap.cl" }, diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index b7245203d4..06989aa15e 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -29,22 +29,143 @@ #define POOL_OP(x, y) (fmax((x), (y))) #endif /* POOL_AVG */ -float calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) +#if STRIDE_X == 1 +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output) +#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */ +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output) +#endif /* STRIDE_X == 3 */ + +#define CONVERT_OP(data_type) convert_##data_type##4 +#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type) + +#define POOLING3x3_STRIDE1(res, input, output) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data00 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + data01 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data10 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + data11 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \ + \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01212323); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01212323); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \ + \ + values00 = POOL_OP(values00, values10); \ + values01 = POOL_OP(values01, values11); \ + values00 = POOL_OP(values00, values20); \ + values01 = POOL_OP(values01, values21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \ + }) + +#define POOLING3x3_STRIDE2(res, input, output) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + DATA_TYPE data01 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + DATA_TYPE data11 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ + \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s667, data01); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01223445); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data10.s667, data11); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01223445); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data20.s667, data21); \ + \ + values00 = POOL_OP(values00, values10); \ + values01 = POOL_OP(values01, values11); \ + values00 = POOL_OP(values00, values20); \ + values01 = POOL_OP(values01, values21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \ + }) + +#define POOLING3x3_STRIDE3(res, input, output) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data01 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data11 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ + \ + data00 = POOL_OP(data00, data10); \ + data01 = POOL_OP(data01, data11); \ + data00 = POOL_OP(data00, data20); \ + data01 = POOL_OP(data01, data21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s147, data01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03)); \ + }) + +DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) { - int start_x = get_global_id(0) * stride_x - pad_x; - int start_y = get_global_id(1) * stride_y - pad_y; - int end_x = min(start_x + pool_size, upper_bound_w); - int end_y = min(start_y + pool_size, upper_bound_h); + const int start_x = get_global_id(0) * stride_x - pad_x; + const int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size, upper_bound_w); + const int end_y = min(start_y + pool_size, upper_bound_h); return 1.f / ((end_y - start_y) * (end_x - start_x)); } +VEC_DATA_TYPE(DATA_TYPE, 4) +calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + const int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x; + const int start_y = get_global_id(1) * stride_y - pad_y; + const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w); + const int end_y = min(start_y + pool_size, upper_bound_h); + return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x)); +} + /** Performs a pooling function of pool size equal to 2. * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32; - * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32 + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -52,7 +173,7 @@ float calculate_avg_scale(const int pool_size, const int upper_bound_w, const in * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32 + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) @@ -60,18 +181,10 @@ float calculate_avg_scale(const int pool_size, const int upper_bound_w, const in * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad) - * @param[in] strides The pooling operation strides in each dimension - * @param[in] paddings The pooling operation paddings in each dimension */ __kernel void pooling_layer_2( TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output) -#ifdef POOL_AVG - , - int2 max_dims, int2 strides, int2 paddings -#endif /* POOL_AVG */ -) + TENSOR3D_DECLARATION(output)) { // Get pixels pointer Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); @@ -89,19 +202,23 @@ __kernel void pooling_layer_2( // Divide by pool region in case of average pooling #ifdef POOL_AVG - res *= calculate_avg_scale(2, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y); + res *= calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); #endif /* POOL_AVG */ // Store result *(__global DATA_TYPE *)output.ptr = res; } -/** Performs a pooling function of pool size equal to 3. +/** Performs a pooling function of pool size equal to 3 * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32; - * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32 + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -109,7 +226,7 @@ __kernel void pooling_layer_2( * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32 + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) @@ -117,18 +234,10 @@ __kernel void pooling_layer_2( * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad) - * @param[in] strides The pooling operation strides in each dimension - * @param[in] paddings The pooling operation paddings in each dimension */ __kernel void pooling_layer_3( TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output) -#ifdef POOL_AVG - , - int2 max_dims, int2 strides, int2 paddings -#endif /* POOL_AVG */ -) + TENSOR3D_DECLARATION(output)) { // Get pixels pointer Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); @@ -149,19 +258,73 @@ __kernel void pooling_layer_3( // Divide by pool region in case of average pooling #ifdef POOL_AVG - res *= calculate_avg_scale(3, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y); -#endif /* POOL_AVG */ + res *= calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif //POOL_AVG // Store result *(__global DATA_TYPE *)output.ptr = res; } +#if defined(POOLING3x3) +/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void pooling_layer_3_optimized( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, 4) + res; + + // Perform pooling 3x3 for 4 output elements + POOLING3x3(res, input, output); + + // Divide by pool region in case of average pooling +#ifdef POOL_AVG + res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif // POOL_AVG + + vstore4(res, 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(POOLING3x3) + /** Performs a pooling function of pool size equal to 7. * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32; - * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32 + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -169,7 +332,7 @@ __kernel void pooling_layer_3( * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32 + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) @@ -177,18 +340,10 @@ __kernel void pooling_layer_3( * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad) - * @param[in] strides The pooling operation strides in each dimension - * @param[in] paddings The pooling operation paddings in each dimension */ __kernel void pooling_layer_7( TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output) -#ifdef POOL_AVG - , - int2 max_dims, int2 strides, int2 paddings -#endif /* POOL_AVG */ -) + TENSOR3D_DECLARATION(output)) { // Get pixels pointer Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); @@ -234,7 +389,7 @@ __kernel void pooling_layer_7( // Divide by pool region in case of average pooling #ifdef POOL_AVG - res *= calculate_avg_scale(7, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y); + res *= calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); #endif /* POOL_AVG */ // Store result diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index ca75fd56fb..6b2e881e68 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -41,7 +41,7 @@ using namespace arm_compute; CLPoolingLayerKernel::CLPoolingLayerKernel() - : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0) + : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1) { } @@ -92,11 +92,21 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h)); - const int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size; - const int input_width = input->info()->dimension(0); - const int input_height = input->info()->dimension(1); - const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where + // each thread computes 4 output elements + const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3); + + int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size; + if(is_pool3x3_stride_le3) + { + // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3 + _num_elems_processed_per_iteration = 4; + num_elements_read_per_iteration = pool_size * (pool_stride_x + 1); + } + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; // Set instance variables _input = input; @@ -110,49 +120,31 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, std::set<std::string> build_opts; build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG")))); + build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x))); + if(pool_type == PoolingType::AVG) + { + build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x))); + build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y))); + build_opts.emplace(("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y))); + build_opts.emplace(("-DPAD_X=" + support::cpp11::to_string(pool_pad_x))); + build_opts.emplace(("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y))); + } // Create kernel std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size); - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); - - // Set static kernel arguments - if(pool_type == PoolingType::AVG) + if(is_pool3x3_stride_le3) + { + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts)); + } + else { - // Create static kernel arguments - const cl_int2 max_dims = - { - { - static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x, - static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y, - } - }; - const cl_int2 strides = - { - { - pool_stride_x, - pool_stride_y, - } - }; - const cl_int2 paddings = - { - { - pool_pad_x, - pool_pad_y, - } - }; - - // Set static kernel arguments - unsigned int idx = 2 * num_arguments_per_3D_tensor(); - _kernel.setArg<cl_int2>(idx++, max_dims); - _kernel.setArg<cl_int2>(idx++, strides); - _kernel.setArg<cl_int2>(idx++, paddings); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); } // Configure kernel window - const unsigned int num_elems_processed_per_iteration = 1; - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration)); AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration); update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); ICLKernel::configure(win); @@ -174,7 +166,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) { // Upsample input by pool size Window in_slice(slice); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x)); + in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration)); in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y)); // Set inputs |