From cb29283e0d65297f4756e202df07eac1107841e6 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 2 Aug 2017 13:19:48 +0100 Subject: COMPMID-477 - Optimizing Pooling 3x3 with stride_x <= 3 on OpenCL Change-Id: Ie000166307cdb5bfae00ebf84d35e49a6bfb9dbd Reviewed-on: http://mpd-gerrit.cambridge.arm.com/83372 Tested-by: Kaizen Reviewed-by: Pablo Tello Reviewed-by: Anthony Barbier --- src/core/CL/cl_kernels/pooling_layer.cl | 255 +++++++++++++++++++++++++------- 1 file changed, 205 insertions(+), 50 deletions(-) (limited to 'src/core/CL/cl_kernels/pooling_layer.cl') diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index b7245203d4..06989aa15e 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -29,22 +29,143 @@ #define POOL_OP(x, y) (fmax((x), (y))) #endif /* POOL_AVG */ -float calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) +#if STRIDE_X == 1 +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output) +#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */ +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output) +#endif /* STRIDE_X == 3 */ + +#define CONVERT_OP(data_type) convert_##data_type##4 +#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type) + +#define POOLING3x3_STRIDE1(res, input, output) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data00 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + data01 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data10 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + data11 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \ + \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01212323); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01212323); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \ + \ + values00 = POOL_OP(values00, values10); \ + values01 = POOL_OP(values01, values11); \ + values00 = POOL_OP(values00, values20); \ + values01 = POOL_OP(values01, values21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \ + }) + +#define POOLING3x3_STRIDE2(res, input, output) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + DATA_TYPE data01 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + DATA_TYPE data11 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ + \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s667, data01); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01223445); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data10.s667, data11); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01223445); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data20.s667, data21); \ + \ + values00 = POOL_OP(values00, values10); \ + values01 = POOL_OP(values01, values11); \ + values00 = POOL_OP(values00, values20); \ + values01 = POOL_OP(values01, values21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \ + }) + +#define POOLING3x3_STRIDE3(res, input, output) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data01 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data11 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ + \ + data00 = POOL_OP(data00, data10); \ + data01 = POOL_OP(data01, data11); \ + data00 = POOL_OP(data00, data20); \ + data01 = POOL_OP(data01, data21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s147, data01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03)); \ + }) + +DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) { - int start_x = get_global_id(0) * stride_x - pad_x; - int start_y = get_global_id(1) * stride_y - pad_y; - int end_x = min(start_x + pool_size, upper_bound_w); - int end_y = min(start_y + pool_size, upper_bound_h); + const int start_x = get_global_id(0) * stride_x - pad_x; + const int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size, upper_bound_w); + const int end_y = min(start_y + pool_size, upper_bound_h); return 1.f / ((end_y - start_y) * (end_x - start_x)); } +VEC_DATA_TYPE(DATA_TYPE, 4) +calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + const int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x; + const int start_y = get_global_id(1) * stride_y - pad_y; + const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w); + const int end_y = min(start_y + pool_size, upper_bound_h); + return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x)); +} + /** Performs a pooling function of pool size equal to 2. * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32; - * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32 + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -52,7 +173,7 @@ float calculate_avg_scale(const int pool_size, const int upper_bound_w, const in * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32 + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) @@ -60,18 +181,10 @@ float calculate_avg_scale(const int pool_size, const int upper_bound_w, const in * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad) - * @param[in] strides The pooling operation strides in each dimension - * @param[in] paddings The pooling operation paddings in each dimension */ __kernel void pooling_layer_2( TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output) -#ifdef POOL_AVG - , - int2 max_dims, int2 strides, int2 paddings -#endif /* POOL_AVG */ -) + TENSOR3D_DECLARATION(output)) { // Get pixels pointer Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); @@ -89,19 +202,23 @@ __kernel void pooling_layer_2( // Divide by pool region in case of average pooling #ifdef POOL_AVG - res *= calculate_avg_scale(2, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y); + res *= calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); #endif /* POOL_AVG */ // Store result *(__global DATA_TYPE *)output.ptr = res; } -/** Performs a pooling function of pool size equal to 3. +/** Performs a pooling function of pool size equal to 3 * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32; - * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32 + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -109,7 +226,7 @@ __kernel void pooling_layer_2( * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32 + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) @@ -117,18 +234,10 @@ __kernel void pooling_layer_2( * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad) - * @param[in] strides The pooling operation strides in each dimension - * @param[in] paddings The pooling operation paddings in each dimension */ __kernel void pooling_layer_3( TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output) -#ifdef POOL_AVG - , - int2 max_dims, int2 strides, int2 paddings -#endif /* POOL_AVG */ -) + TENSOR3D_DECLARATION(output)) { // Get pixels pointer Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); @@ -149,19 +258,73 @@ __kernel void pooling_layer_3( // Divide by pool region in case of average pooling #ifdef POOL_AVG - res *= calculate_avg_scale(3, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y); -#endif /* POOL_AVG */ + res *= calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif //POOL_AVG // Store result *(__global DATA_TYPE *)output.ptr = res; } +#if defined(POOLING3x3) +/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void pooling_layer_3_optimized( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, 4) + res; + + // Perform pooling 3x3 for 4 output elements + POOLING3x3(res, input, output); + + // Divide by pool region in case of average pooling +#ifdef POOL_AVG + res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif // POOL_AVG + + vstore4(res, 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(POOLING3x3) + /** Performs a pooling function of pool size equal to 7. * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32; - * @note In case of average pooling -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32 + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) @@ -169,7 +332,7 @@ __kernel void pooling_layer_3( * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32 + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) @@ -177,18 +340,10 @@ __kernel void pooling_layer_3( * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad) - * @param[in] strides The pooling operation strides in each dimension - * @param[in] paddings The pooling operation paddings in each dimension */ __kernel void pooling_layer_7( TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output) -#ifdef POOL_AVG - , - int2 max_dims, int2 strides, int2 paddings -#endif /* POOL_AVG */ -) + TENSOR3D_DECLARATION(output)) { // Get pixels pointer Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); @@ -234,7 +389,7 @@ __kernel void pooling_layer_7( // Divide by pool region in case of average pooling #ifdef POOL_AVG - res *= calculate_avg_scale(7, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y); + res *= calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); #endif /* POOL_AVG */ // Store result -- cgit v1.2.1