From 0c687044c3f5d7f294858debfc7c4c070228a9b4 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 14 Jun 2022 15:13:16 +0100 Subject: Fix performance regression in Winograd Output Transform (OpenCL) The regression was caused by NUM_TILES_X passed at runtime. Resolves COMPMID-5327 Change-Id: Id6ccd93784eda93af09f420c0d786050e2bbccd7 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7727 Tested-by: Arm Jenkins Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../cl_kernels/nhwc/winograd_output_transform.cl | 58 +++++++--------------- .../cl/kernels/ClWinogradOutputTransformKernel.cpp | 2 +- 2 files changed, 18 insertions(+), 42 deletions(-) diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl index ed6da9fd12..bab2ee850c 100644 --- a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl +++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl @@ -61,7 +61,6 @@ * @param[in] _ISRC_HEIGHT The source tensor's height * @param[in] _IDST_WIDTH The destination tensor's width * @param[in] _IDST_HEIGHT The destination tensor's height - * @param[in] _INUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_2x2_7x7_nhwc( TENSOR4D(src, BUFFER), @@ -72,15 +71,14 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( int dst_size, const int _ISRC_HEIGHT, const int _IDST_WIDTH, - const int _IDST_HEIGHT, - const int _INUM_TILES_X) + const int _IDST_HEIGHT) { const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) TILE(DATA_TYPE, 8, N0, in); @@ -240,7 +238,6 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( * @param[in] SRC_HEIGHT The source tensor's height * @param[in] DST_WIDTH The destination tensor's width * @param[in] DST_HEIGHT The destination tensor's height - * @param[in] NUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_4x4_3x3_nhwc( TENSOR4D(src, BUFFER), @@ -251,8 +248,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( int dst_size, const int SRC_HEIGHT, const int DST_WIDTH, - const int DST_HEIGHT, - const int NUM_TILES_X) + const int DST_HEIGHT) { const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES @@ -439,7 +435,6 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( * @param[in] SRC_HEIGHT The source tensor's height * @param[in] DST_WIDTH The destination tensor's width * @param[in] DST_HEIGHT The destination tensor's height - * @param[in] NUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_4x4_5x5_nhwc( TENSOR4D(src, BUFFER), @@ -450,8 +445,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( int dst_size, const int SRC_HEIGHT, const int DST_WIDTH, - const int DST_HEIGHT, - const int NUM_TILES_X) + const int DST_HEIGHT) { const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES @@ -656,7 +650,6 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( * @param[in] SRC_HEIGHT The source tensor's height * @param[in] DST_WIDTH The destination tensor's width * @param[in] DST_HEIGHT The destination tensor's height - * @param[in] NUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_2x1_7x1_nhwc( TENSOR4D_DECLARATION(src), @@ -667,8 +660,7 @@ __kernel void winograd_output_transform_2x1_7x1_nhwc( int dst_size, const int SRC_HEIGHT, const int DST_WIDTH, - const int DST_HEIGHT, - const int NUM_TILES_X) + const int DST_HEIGHT) { winograd_output_transform_2x2_7x7_nhwc(src_ptr, src_stride_x, @@ -699,8 +691,7 @@ __kernel void winograd_output_transform_2x1_7x1_nhwc( dst_size, SRC_HEIGHT, DST_WIDTH, - DST_HEIGHT, - NUM_TILES_X); + DST_HEIGHT); } #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) #endif // defined(VEC_SIZE) && VEC_SIZE == 2 @@ -739,7 +730,6 @@ __kernel void winograd_output_transform_2x1_7x1_nhwc( * @param[in] SRC_HEIGHT The source tensor's height * @param[in] DST_WIDTH The destination tensor's width * @param[in] DST_HEIGHT The destination tensor's height - * @param[in] NUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_4x1_3x1_nhwc( TENSOR4D_DECLARATION(src), @@ -750,8 +740,7 @@ __kernel void winograd_output_transform_4x1_3x1_nhwc( int dst_size, const int SRC_HEIGHT, const int DST_WIDTH, - const int DST_HEIGHT, - const int NUM_TILES_X) + const int DST_HEIGHT) { winograd_output_transform_4x4_3x3_nhwc(src_ptr, src_stride_x, @@ -782,8 +771,7 @@ __kernel void winograd_output_transform_4x1_3x1_nhwc( dst_size, SRC_HEIGHT, DST_WIDTH, - DST_HEIGHT, - NUM_TILES_X); + DST_HEIGHT); } #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) @@ -820,7 +808,6 @@ __kernel void winograd_output_transform_4x1_3x1_nhwc( * @param[in] SRC_HEIGHT The source tensor's height * @param[in] DST_WIDTH The destination tensor's width * @param[in] DST_HEIGHT The destination tensor's height - * @param[in] NUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_4x1_5x1_nhwc( TENSOR4D_DECLARATION(src), @@ -831,8 +818,7 @@ __kernel void winograd_output_transform_4x1_5x1_nhwc( int dst_size, const int SRC_HEIGHT, const int DST_WIDTH, - const int DST_HEIGHT, - const int NUM_TILES_X) + const int DST_HEIGHT) { winograd_output_transform_4x4_5x5_nhwc(src_ptr, src_stride_x, @@ -863,8 +849,7 @@ __kernel void winograd_output_transform_4x1_5x1_nhwc( dst_size, SRC_HEIGHT, DST_WIDTH, - DST_HEIGHT, - NUM_TILES_X); + DST_HEIGHT); } #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) #endif // defined(VEC_SIZE) && VEC_SIZE == 4 @@ -905,7 +890,6 @@ __kernel void winograd_output_transform_4x1_5x1_nhwc( * @param[in] SRC_HEIGHT The source tensor's height * @param[in] DST_WIDTH The destination tensor's width * @param[in] DST_HEIGHT The destination tensor's height - * @param[in] NUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_1x2_1x7_nhwc( TENSOR4D_DECLARATION(src), @@ -916,8 +900,7 @@ __kernel void winograd_output_transform_1x2_1x7_nhwc( int dst_size, const int SRC_HEIGHT, const int DST_WIDTH, - const int DST_HEIGHT, - const int NUM_TILES_X) + const int DST_HEIGHT) { winograd_output_transform_2x2_7x7_nhwc(src_ptr, src_stride_x, @@ -948,8 +931,7 @@ __kernel void winograd_output_transform_1x2_1x7_nhwc( dst_size, SRC_HEIGHT, DST_WIDTH, - DST_HEIGHT, - NUM_TILES_X); + DST_HEIGHT); } #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC) #endif // defined(VEC_SIZE) && VEC_SIZE == 2 @@ -988,7 +970,6 @@ __kernel void winograd_output_transform_1x2_1x7_nhwc( * @param[in] SRC_HEIGHT The source tensor's height * @param[in] DST_WIDTH The destination tensor's width * @param[in] DST_HEIGHT The destination tensor's height - * @param[in] NUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_1x4_1x3_nhwc( TENSOR4D_DECLARATION(src), @@ -999,8 +980,7 @@ __kernel void winograd_output_transform_1x4_1x3_nhwc( int dst_size, const int SRC_HEIGHT, const int DST_WIDTH, - const int DST_HEIGHT, - const int NUM_TILES_X) + const int DST_HEIGHT) { winograd_output_transform_4x4_3x3_nhwc(src_ptr, src_stride_x, @@ -1031,8 +1011,7 @@ __kernel void winograd_output_transform_1x4_1x3_nhwc( dst_size, SRC_HEIGHT, DST_WIDTH, - DST_HEIGHT, - NUM_TILES_X); + DST_HEIGHT); } #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC) @@ -1069,7 +1048,6 @@ __kernel void winograd_output_transform_1x4_1x3_nhwc( * @param[in] SRC_HEIGHT The source tensor's height * @param[in] DST_WIDTH The destination tensor's width * @param[in] DST_HEIGHT The destination tensor's height - * @param[in] NUM_TILES_X The number of tiles along the X direction */ __kernel void winograd_output_transform_1x4_1x5_nhwc( TENSOR4D_DECLARATION(src), @@ -1080,8 +1058,7 @@ __kernel void winograd_output_transform_1x4_1x5_nhwc( int dst_size, const int SRC_HEIGHT, const int DST_WIDTH, - const int DST_HEIGHT, - const int NUM_TILES_X) + const int DST_HEIGHT) { winograd_output_transform_4x4_5x5_nhwc(src_ptr, src_stride_x, @@ -1112,8 +1089,7 @@ __kernel void winograd_output_transform_1x4_1x5_nhwc( dst_size, SRC_HEIGHT, DST_WIDTH, - DST_HEIGHT, - NUM_TILES_X); + DST_HEIGHT); } #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC) #endif // defined(VEC_SIZE) && VEC_SIZE == 4 diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index ff57c83959..a664d1ec1d 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -187,6 +187,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); + build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); } else { @@ -279,7 +280,6 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg(idx2++, _src_height); _kernel.setArg(idx2++, _dst_width); _kernel.setArg(idx2++, _dst_height); - _kernel.setArg(idx2++, _num_tiles_x); } do -- cgit v1.2.1