aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorramelg01 <ramy.elgammal@arm.com>2022-02-04 20:49:14 +0000
committerRamy Elgammal <ramy.elgammal@arm.com>2022-02-09 10:14:47 +0000
commit2a86a30a5d9c047b0ec73a699b09a128f3fcb55e (patch)
tree4421ab41ba31bd916a0331b1011184110bd51ff4
parentba8690b4f0f82dfb66e3da819361e4032e9fa4db (diff)
downloadComputeLibrary-2a86a30a5d9c047b0ec73a699b09a128f3fcb55e.tar.gz
Improve start-up time for winograd_input_transform_*_nhwc
- pass tensor's dimensions at runtime rather than compile time - Add guard macro to compile only kernel(s) of internest Resolves: COMPMID-5119 Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com> Change-Id: Ib01098e397011a1201c2800c62a8954ec70e63e8 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7083 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl183
-rw-r--r--src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp16
-rw-r--r--src/gpu/cl/kernels/ClWinogradInputTransformKernel.h4
3 files changed, 147 insertions, 56 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
index 4865982a55..ba7b13b774 100644
--- a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,16 +62,16 @@
out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
})
-#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
-#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
+#if defined(NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -99,23 +99,24 @@
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
x -= PAD_LEFT;
@@ -234,14 +235,15 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_3X3_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -269,23 +271,24 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
x -= PAD_LEFT;
@@ -403,14 +406,15 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X4_5X5_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -438,23 +442,24 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
x -= PAD_LEFT;
@@ -577,14 +582,15 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X2_7X7_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC) || defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -612,11 +618,19 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -637,16 +651,21 @@ __kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
dst_step_z,
dst_stride_w,
dst_step_w,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_3X1_STEPZ1_NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -674,11 +693,19 @@ __kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -699,16 +726,21 @@ __kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
dst_step_z,
dst_stride_w,
dst_step_w,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_4X1_5X1_STEPZ1_NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -736,11 +768,19 @@ __kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -761,16 +801,22 @@ __kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
dst_step_z,
dst_stride_w,
dst_step_w,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_2X1_7X1_STEPZ1_NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ *
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -798,11 +844,19 @@ __kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -823,16 +877,21 @@ __kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
dst_step_z,
dst_stride_w,
dst_step_w,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X3_STEPZ1_NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -860,11 +919,19 @@ __kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -885,16 +952,21 @@ __kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
dst_step_z,
dst_stride_w,
dst_step_w,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X4_1X5_STEPZ1_NHWC)
+#if defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
*
* @note Data layout supported: NHWC
* @note Data type supported: F32/F16
* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
* @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
* @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
@@ -922,11 +994,19 @@ __kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] _ISRC_WIDTH The src tensor's width
+ * @param[in] _ISRC_HEIGHT The src tensor's height
+ * @param[in] _INUM_TILES_X The number of tiles in the X dimension
+ * @param[in] _INUM_TILES_Y The number of tiles in the Y dimension
*/
//! @endcond
__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
+ TENSOR4D(dst, BUFFER),
+ const int _ISRC_WIDTH,
+ const int _ISRC_HEIGHT,
+ const int _INUM_TILES_X,
+ const int _INUM_TILES_Y)
{
winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
src_stride_x,
@@ -947,7 +1027,12 @@ __kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
dst_step_z,
dst_stride_w,
dst_step_w,
- dst_offset_first_element_in_bytes);
+ dst_offset_first_element_in_bytes,
+ _ISRC_WIDTH,
+ _ISRC_HEIGHT,
+ _INUM_TILES_X,
+ _INUM_TILES_Y);
}
-#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_1X2_1X7_STEPZ1_NHWC)
+#endif // defined(NHWC)
#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
index 58874216bb..d6b038f0f8 100644
--- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -147,10 +147,8 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
if(_data_layout == DataLayout::NHWC)
{
build_opts.add_option("-DNHWC");
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_w)));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_h)));
- build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
- build_opts.add_option("-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
+ _src_width = src->dimension(idx_w);
+ _src_height = src->dimension(idx_h);
build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
@@ -189,6 +187,8 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
kernel_name += support::cpp11::to_string(_step_z);
kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
+ // A macro guard to compile ONLY the kernel of interest
+ build_opts.add_option("-D" + upper_string(kernel_name));
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Create window and update padding
@@ -247,6 +247,10 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &
unsigned int idx = 0;
add_4D_tensor_argument(idx, src, slice);
add_4D_tensor_argument(idx, dst, slice);
+ _kernel.setArg<cl_uint>(idx++, _src_width);
+ _kernel.setArg<cl_uint>(idx++, _src_height);
+ _kernel.setArg<cl_uint>(idx++, _num_tiles_x);
+ _kernel.setArg<cl_uint>(idx++, _num_tiles_y);
enqueue(queue, *this, slice, lws_hint());
}
else
@@ -275,4 +279,4 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &
}
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
index 631f427b82..c10c528b9b 100644
--- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,6 +80,8 @@ private:
int _num_tiles_x{ 0 };
int _num_tiles_y{ 0 };
unsigned int _step_z{ 1 };
+ int32_t _src_width{ 0 };
+ int32_t _src_height{ 0 };
};
} // namespace kernels
} // namespace opencl