From 598e3a8f32c68129958b9f6a40c684842f708f8a Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 13 Apr 2021 15:53:20 +0100 Subject: Fix TILE initialization in direct convolution and winograd transforms - The array initializer for the TILE object cannot always be utilized and so we do require to manually initialize the TILE with the LOOP_UNROLLING macro - Resolves COMPMID-4371 Change-Id: I2598354b9fae84c5e3bd11219fffdcdc297215e1 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5417 Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/direct_convolution.cl | 7 +- src/core/CL/cl_kernels/winograd_input_transform.cl | 112 +++++++++++++-------- .../CL/cl_kernels/winograd_output_transform.cl | 74 ++++++++++---- 3 files changed, 132 insertions(+), 61 deletions(-) diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl index 220179effb..a9a997f9ac 100644 --- a/src/core/CL/cl_kernels/direct_convolution.cl +++ b/src/core/CL/cl_kernels/direct_convolution.cl @@ -150,7 +150,12 @@ __kernel void direct_convolution_nhwc( } // Initialize the accumulators - TILE(ACC_DATA_TYPE, M0, N0, c) = { { { 0 } } }; + TILE(ACC_DATA_TYPE, M0, N0, c); + + LOOP_UNROLLING(int, i, 0, M0, 1) + { + c[i].v = 0; + } for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl index 60750f054d..932e1643fd 100644 --- a/src/core/CL/cl_kernels/winograd_input_transform.cl +++ b/src/core/CL/cl_kernels/winograd_input_transform.cl @@ -944,7 +944,7 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) @@ -967,10 +967,14 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 6, 1, in) = { { { 0 } } }; - TILE(DATA_TYPE, 6, 1, out) = { { { 0 } } }; - TILE(int, 6, 1, src_indirect_y) = { { { 0 } } }; - TILE(int, 6, 1, src_indirect_mask) = { { { 0 } } }; + TILE(DATA_TYPE, 6, 1, in); + TILE(DATA_TYPE, 6, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 6, 1) + { + in[i].v = 0; + } #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); @@ -978,7 +982,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); #endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - TILE(DATA_TYPE, 6, 1, com) = { { { 0 } } }; + TILE(DATA_TYPE, 6, 1, com); LOOP_UNROLLING(int, i, 0, 6, 1) { @@ -1000,7 +1004,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( out[4].v = com[5].v - com[4].v; out[5].v = com[3].v - com[1].v; - TILE(uint, 6, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 6, 1, dst_indirect_y); LOOP_UNROLLING(int, i, 0, 6, 1) { @@ -1012,13 +1016,19 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( #else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 36, 1, in) = { { { 0 } } }; + TILE(DATA_TYPE, 36, 1, in); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 36, 1) + { + in[i].v = 0; + } // Load the tile from a NHWC tensor T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); - TILE(DATA_TYPE, 6, 1, com) = { { { 0 } } }; - TILE(DATA_TYPE, 36, 1, tmp) = { { { 0 } } }; + TILE(DATA_TYPE, 6, 1, com); + TILE(DATA_TYPE, 36, 1, tmp); LOOP_UNROLLING(int, i, 0, 6, 1) { @@ -1037,7 +1047,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( tmp[i + 5 * 6].v = com[3].v - com[1].v; } - TILE(DATA_TYPE, 36, 1, out) = { { { 0 } } }; + TILE(DATA_TYPE, 36, 1, out); LOOP_UNROLLING(int, i, 0, 6, 1) { @@ -1057,7 +1067,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( } // Compute destination address - TILE(uint, 36, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 36, 1, dst_indirect_y); LOOP_UNROLLING(int, i, 0, 36, 1) { @@ -1104,7 +1114,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) @@ -1127,10 +1137,14 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 8, 1, in) = { { { 0 } } }; - TILE(DATA_TYPE, 8, 1, out) = { { { 0 } } }; - TILE(int, 8, 1, src_indirect_y) = { { { 0 } } }; - TILE(int, 8, 1, src_indirect_mask) = { { { 0 } } }; + TILE(DATA_TYPE, 8, 1, in); + TILE(DATA_TYPE, 8, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 8, 1) + { + in[i].v = 0; + } #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); @@ -1138,7 +1152,7 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); #endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } }; + TILE(DATA_TYPE, 1, 8, com); com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v; com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v; @@ -1155,7 +1169,7 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( out[6].s[0] = com[0].s[4] - com[0].s[5]; out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v; - TILE(uint, 8, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 8, 1, dst_indirect_y); LOOP_UNROLLING(int, i, 0, 8, 1) { @@ -1167,13 +1181,19 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( #else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 64, 1, in) = { { { 0 } } }; - TILE(DATA_TYPE, 64, 1, out) = { { { 0 } } }; + TILE(DATA_TYPE, 64, 1, in); + TILE(DATA_TYPE, 64, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 64, 1) + { + in[i].v = 0; + } // Load the tile from a NHWC tensor T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); - TILE(DATA_TYPE, 8, 8, com) = { { { 0 } } }; + TILE(DATA_TYPE, 8, 8, com); LOOP_UNROLLING(int, i, 0, 8, 1) { @@ -1187,7 +1207,7 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0]; } - TILE(DATA_TYPE, 8, 8, tmp) = { { { 0 } } }; + TILE(DATA_TYPE, 8, 8, tmp); tmp[0].v = com[6].v; tmp[1].v = com[0].v + com[1].v; tmp[2].v = com[0].v - com[1].v; @@ -1215,7 +1235,7 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7]; } - TILE(uint, 64, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 64, 1, dst_indirect_y); LOOP_UNROLLING(int, i, 0, 64, 1) { @@ -1263,7 +1283,7 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) @@ -1286,10 +1306,14 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 8, 1, in) = { { { 0 } } }; - TILE(DATA_TYPE, 8, 1, out) = { { { 0 } } }; - TILE(int, 8, 1, src_indirect_y) = { { { 0 } } }; - TILE(int, 8, 1, src_indirect_mask) = { { { 0 } } }; + TILE(DATA_TYPE, 8, 1, in); + TILE(DATA_TYPE, 8, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 8, 1) + { + in[i].v = 0; + } #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); @@ -1319,7 +1343,7 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( out[6].s[0] = com[0].s[4] + com[0].s[5]; out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v; - TILE(uint, 8, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 8, 1, dst_indirect_y); LOOP_UNROLLING(int, i, 0, 8, 1) { @@ -1331,13 +1355,19 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( #else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 64, 1, in) = { { { 0 } } }; - TILE(DATA_TYPE, 64, 1, out) = { { { 0 } } }; + TILE(DATA_TYPE, 64, 1, in); + TILE(DATA_TYPE, 64, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 64, 1) + { + in[i].v = 0; + } // Load the tile from a NHWC tensor T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); - TILE(DATA_TYPE, 8, 8, com) = { { { 0 } } }; + TILE(DATA_TYPE, 8, 8, com); LOOP_UNROLLING(int, i, 0, 8, 1) { @@ -1351,7 +1381,7 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0]; } - TILE(DATA_TYPE, 8, 8, tmp) = { { { 0 } } }; + TILE(DATA_TYPE, 8, 8, tmp); tmp[0].v = com[6].v; tmp[1].v = com[0].v - com[1].v; tmp[2].v = com[0].v + com[1].v; @@ -1379,7 +1409,7 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7]; } - TILE(uint, 64, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 64, 1, dst_indirect_y); LOOP_UNROLLING(int, i, 0, 64, 1) { @@ -1427,7 +1457,7 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) @@ -1489,7 +1519,7 @@ __kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) @@ -1551,7 +1581,7 @@ __kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) @@ -1613,7 +1643,7 @@ __kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) @@ -1675,7 +1705,7 @@ __kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) @@ -1737,7 +1767,7 @@ __kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc( * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ - //! @endcond +//! @endcond __kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc( TENSOR4D(src, BUFFER), TENSOR4D(dst, BUFFER)) diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl index f257825967..837e43419a 100644 --- a/src/core/CL/cl_kernels/winograd_output_transform.cl +++ b/src/core/CL/cl_kernels/winograd_output_transform.cl @@ -243,6 +243,12 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8); } + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 8, 1) + { + in[i].v = 0; + } + // Load the values across the 8 channels to compose the 8x1 tile T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); @@ -294,6 +300,12 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64); } + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 64, 1) + { + in[i].v = 0; + } + // Load the values across the 64 channels to compose the 8x8 tile T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); @@ -615,9 +627,9 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 6, N0, in) = { { { 0 } } }; - TILE(DATA_TYPE, 4, N0, out) = { { { 0 } } }; - TILE(uint, 6, 1, src_indirect_y) = { { { 0 } } }; + TILE(DATA_TYPE, 6, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(uint, 6, 1, src_indirect_y); LOOP_UNROLLING(int, i, 0, 6, 1) { @@ -625,6 +637,12 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6); } + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 6, 1) + { + in[i].v = 0; + } + // Load the values across the 36 channels to compose the 6x6 or 6x1 tile T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); @@ -648,7 +666,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - TILE(uint, 4, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 4, 1, dst_indirect_y); // Calculate the destination indirect Y #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) @@ -673,9 +691,9 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) // Calculate the indirect Y for the source tensor - TILE(DATA_TYPE, 36, N0, in) = { { { 0 } } }; - TILE(DATA_TYPE, 4, N0, tmp) = { { { 0 } } }; - TILE(uint, 36, 1, src_indirect_y) = { { { 0 } } }; + TILE(DATA_TYPE, 36, N0, in); + TILE(DATA_TYPE, 4, N0, tmp); + TILE(uint, 36, 1, src_indirect_y); LOOP_UNROLLING(int, i, 0, 36, 1) { @@ -683,6 +701,12 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36); } + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 36, 1) + { + in[i].v = 0; + } + // Load the values across the 36 channels to compose the 6x6 or 6x1 tile T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); @@ -700,7 +724,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( } // Compute the output tile - TILE(DATA_TYPE, 16, N0, out) = { { { 0 } } }; + TILE(DATA_TYPE, 16, N0, out); LOOP_UNROLLING(int, i, 0, 4, 1) { @@ -729,7 +753,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - TILE(uint, 16, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 16, 1, dst_indirect_y); // Calculate the destination indirect Y LOOP_UNROLLING(int, yk, 0, 4, 1) @@ -1043,10 +1067,10 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 8, N0, in) = { { { 0 } } }; - TILE(DATA_TYPE, 4, N0, out) = { { { 0 } } }; - TILE(DATA_TYPE, 4, N0, tmp) = { { { 0 } } }; - TILE(uint, 8, 1, src_indirect_y) = { { { 0 } } }; + TILE(DATA_TYPE, 8, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(DATA_TYPE, 4, N0, tmp); + TILE(uint, 8, 1, src_indirect_y); LOOP_UNROLLING(int, i, 0, 8, 1) { @@ -1054,6 +1078,12 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8); } + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 8, 1) + { + in[i].v = 0; + } + // "in" contains 1x8 or 8x1 tile here T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); @@ -1081,7 +1111,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - TILE(uint, 4, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 4, 1, dst_indirect_y); // Calculate the destination indirect Y #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) @@ -1105,9 +1135,9 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) // Calculate the indirect Y for the source tensor - TILE(DATA_TYPE, 64, N0, in) = { { { 0 } } }; - TILE(DATA_TYPE, 6, N0, tmp) = { { { 0 } } }; - TILE(uint, 64, 1, src_indirect_y) = { { { 0 } } }; + TILE(DATA_TYPE, 64, N0, in); + TILE(DATA_TYPE, 6, N0, tmp); + TILE(uint, 64, 1, src_indirect_y); LOOP_UNROLLING(int, i, 0, 64, 1) { @@ -1115,6 +1145,12 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64); } + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 64, 1) + { + in[i].v = 0; + } + // "in" here is 8x8 tile T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); @@ -1138,7 +1174,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( } // Compute the output tile - TILE(DATA_TYPE, 16, N0, out) = { { { 0 } } }; + TILE(DATA_TYPE, 16, N0, out); // in * A, with in = A^T * in as above LOOP_UNROLLING(int, i, 0, 4, 1) @@ -1173,7 +1209,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - TILE(uint, 16, 1, dst_indirect_y) = { { { 0 } } }; + TILE(uint, 16, 1, dst_indirect_y); // Calculate the destination indirect Y LOOP_UNROLLING(int, yk, 0, 4, 1) -- cgit v1.2.1