aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2022-06-14 15:13:16 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2022-06-15 14:47:02 +0000
commit0c687044c3f5d7f294858debfc7c4c070228a9b4 (patch)
treeed0794cc6e2b56e40eda7aa44e83139c46241213
parent894659a98e76d84bf209da27d8ecb6d9ed05b13d (diff)
downloadComputeLibrary-0c687044c3f5d7f294858debfc7c4c070228a9b4.tar.gz
Fix performance regression in Winograd Output Transform (OpenCL)
The regression was caused by NUM_TILES_X passed at runtime. Resolves COMPMID-5327 Change-Id: Id6ccd93784eda93af09f420c0d786050e2bbccd7 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7727 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl58
-rw-r--r--src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp2
2 files changed, 18 insertions, 42 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
index ed6da9fd12..bab2ee850c 100644
--- a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
@@ -61,7 +61,6 @@
* @param[in] _ISRC_HEIGHT The source tensor's height
* @param[in] _IDST_WIDTH The destination tensor's width
* @param[in] _IDST_HEIGHT The destination tensor's height
- * @param[in] _INUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_2x2_7x7_nhwc(
TENSOR4D(src, BUFFER),
@@ -72,15 +71,14 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc(
int dst_size,
const int _ISRC_HEIGHT,
const int _IDST_WIDTH,
- const int _IDST_HEIGHT,
- const int _INUM_TILES_X)
+ const int _IDST_HEIGHT)
{
const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
- int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
TILE(DATA_TYPE, 8, N0, in);
@@ -240,7 +238,6 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc(
* @param[in] SRC_HEIGHT The source tensor's height
* @param[in] DST_WIDTH The destination tensor's width
* @param[in] DST_HEIGHT The destination tensor's height
- * @param[in] NUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_4x4_3x3_nhwc(
TENSOR4D(src, BUFFER),
@@ -251,8 +248,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
int dst_size,
const int SRC_HEIGHT,
const int DST_WIDTH,
- const int DST_HEIGHT,
- const int NUM_TILES_X)
+ const int DST_HEIGHT)
{
const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
@@ -439,7 +435,6 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
* @param[in] SRC_HEIGHT The source tensor's height
* @param[in] DST_WIDTH The destination tensor's width
* @param[in] DST_HEIGHT The destination tensor's height
- * @param[in] NUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_4x4_5x5_nhwc(
TENSOR4D(src, BUFFER),
@@ -450,8 +445,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc(
int dst_size,
const int SRC_HEIGHT,
const int DST_WIDTH,
- const int DST_HEIGHT,
- const int NUM_TILES_X)
+ const int DST_HEIGHT)
{
const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
@@ -656,7 +650,6 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc(
* @param[in] SRC_HEIGHT The source tensor's height
* @param[in] DST_WIDTH The destination tensor's width
* @param[in] DST_HEIGHT The destination tensor's height
- * @param[in] NUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_2x1_7x1_nhwc(
TENSOR4D_DECLARATION(src),
@@ -667,8 +660,7 @@ __kernel void winograd_output_transform_2x1_7x1_nhwc(
int dst_size,
const int SRC_HEIGHT,
const int DST_WIDTH,
- const int DST_HEIGHT,
- const int NUM_TILES_X)
+ const int DST_HEIGHT)
{
winograd_output_transform_2x2_7x7_nhwc(src_ptr,
src_stride_x,
@@ -699,8 +691,7 @@ __kernel void winograd_output_transform_2x1_7x1_nhwc(
dst_size,
SRC_HEIGHT,
DST_WIDTH,
- DST_HEIGHT,
- NUM_TILES_X);
+ DST_HEIGHT);
}
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
@@ -739,7 +730,6 @@ __kernel void winograd_output_transform_2x1_7x1_nhwc(
* @param[in] SRC_HEIGHT The source tensor's height
* @param[in] DST_WIDTH The destination tensor's width
* @param[in] DST_HEIGHT The destination tensor's height
- * @param[in] NUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_4x1_3x1_nhwc(
TENSOR4D_DECLARATION(src),
@@ -750,8 +740,7 @@ __kernel void winograd_output_transform_4x1_3x1_nhwc(
int dst_size,
const int SRC_HEIGHT,
const int DST_WIDTH,
- const int DST_HEIGHT,
- const int NUM_TILES_X)
+ const int DST_HEIGHT)
{
winograd_output_transform_4x4_3x3_nhwc(src_ptr,
src_stride_x,
@@ -782,8 +771,7 @@ __kernel void winograd_output_transform_4x1_3x1_nhwc(
dst_size,
SRC_HEIGHT,
DST_WIDTH,
- DST_HEIGHT,
- NUM_TILES_X);
+ DST_HEIGHT);
}
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
@@ -820,7 +808,6 @@ __kernel void winograd_output_transform_4x1_3x1_nhwc(
* @param[in] SRC_HEIGHT The source tensor's height
* @param[in] DST_WIDTH The destination tensor's width
* @param[in] DST_HEIGHT The destination tensor's height
- * @param[in] NUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_4x1_5x1_nhwc(
TENSOR4D_DECLARATION(src),
@@ -831,8 +818,7 @@ __kernel void winograd_output_transform_4x1_5x1_nhwc(
int dst_size,
const int SRC_HEIGHT,
const int DST_WIDTH,
- const int DST_HEIGHT,
- const int NUM_TILES_X)
+ const int DST_HEIGHT)
{
winograd_output_transform_4x4_5x5_nhwc(src_ptr,
src_stride_x,
@@ -863,8 +849,7 @@ __kernel void winograd_output_transform_4x1_5x1_nhwc(
dst_size,
SRC_HEIGHT,
DST_WIDTH,
- DST_HEIGHT,
- NUM_TILES_X);
+ DST_HEIGHT);
}
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
#endif // defined(VEC_SIZE) && VEC_SIZE == 4
@@ -905,7 +890,6 @@ __kernel void winograd_output_transform_4x1_5x1_nhwc(
* @param[in] SRC_HEIGHT The source tensor's height
* @param[in] DST_WIDTH The destination tensor's width
* @param[in] DST_HEIGHT The destination tensor's height
- * @param[in] NUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_1x2_1x7_nhwc(
TENSOR4D_DECLARATION(src),
@@ -916,8 +900,7 @@ __kernel void winograd_output_transform_1x2_1x7_nhwc(
int dst_size,
const int SRC_HEIGHT,
const int DST_WIDTH,
- const int DST_HEIGHT,
- const int NUM_TILES_X)
+ const int DST_HEIGHT)
{
winograd_output_transform_2x2_7x7_nhwc(src_ptr,
src_stride_x,
@@ -948,8 +931,7 @@ __kernel void winograd_output_transform_1x2_1x7_nhwc(
dst_size,
SRC_HEIGHT,
DST_WIDTH,
- DST_HEIGHT,
- NUM_TILES_X);
+ DST_HEIGHT);
}
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
@@ -988,7 +970,6 @@ __kernel void winograd_output_transform_1x2_1x7_nhwc(
* @param[in] SRC_HEIGHT The source tensor's height
* @param[in] DST_WIDTH The destination tensor's width
* @param[in] DST_HEIGHT The destination tensor's height
- * @param[in] NUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_1x4_1x3_nhwc(
TENSOR4D_DECLARATION(src),
@@ -999,8 +980,7 @@ __kernel void winograd_output_transform_1x4_1x3_nhwc(
int dst_size,
const int SRC_HEIGHT,
const int DST_WIDTH,
- const int DST_HEIGHT,
- const int NUM_TILES_X)
+ const int DST_HEIGHT)
{
winograd_output_transform_4x4_3x3_nhwc(src_ptr,
src_stride_x,
@@ -1031,8 +1011,7 @@ __kernel void winograd_output_transform_1x4_1x3_nhwc(
dst_size,
SRC_HEIGHT,
DST_WIDTH,
- DST_HEIGHT,
- NUM_TILES_X);
+ DST_HEIGHT);
}
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
@@ -1069,7 +1048,6 @@ __kernel void winograd_output_transform_1x4_1x3_nhwc(
* @param[in] SRC_HEIGHT The source tensor's height
* @param[in] DST_WIDTH The destination tensor's width
* @param[in] DST_HEIGHT The destination tensor's height
- * @param[in] NUM_TILES_X The number of tiles along the X direction
*/
__kernel void winograd_output_transform_1x4_1x5_nhwc(
TENSOR4D_DECLARATION(src),
@@ -1080,8 +1058,7 @@ __kernel void winograd_output_transform_1x4_1x5_nhwc(
int dst_size,
const int SRC_HEIGHT,
const int DST_WIDTH,
- const int DST_HEIGHT,
- const int NUM_TILES_X)
+ const int DST_HEIGHT)
{
winograd_output_transform_4x4_5x5_nhwc(src_ptr,
src_stride_x,
@@ -1112,8 +1089,7 @@ __kernel void winograd_output_transform_1x4_1x5_nhwc(
dst_size,
SRC_HEIGHT,
DST_WIDTH,
- DST_HEIGHT,
- NUM_TILES_X);
+ DST_HEIGHT);
}
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
#endif // defined(VEC_SIZE) && VEC_SIZE == 4
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
index ff57c83959..a664d1ec1d 100644
--- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -187,6 +187,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
+ build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
}
else
{
@@ -279,7 +280,6 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_int>(idx2++, _src_height);
_kernel.setArg<cl_int>(idx2++, _dst_width);
_kernel.setArg<cl_int>(idx2++, _dst_height);
- _kernel.setArg<cl_int>(idx2++, _num_tiles_x);
}
do