4 files changed, 166 insertions, 42 deletions
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index 9289cb0026..34bf2902e8 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -44,7 +44,6 @@
     })
 
 #if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
-#if defined(SRC_DEPTH)
 /** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
@@ -82,11 +81,19 @@ __kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
     const int z = get_global_id(2) % SRC_DEPTH;
     const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
 
     // Compute input address
+#if defined(SRC_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
     src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
 
@@ -146,7 +153,11 @@ __kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
     DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
+#if defined(SRC_DEPTH)
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
 
     *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
     *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
@@ -206,12 +217,19 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
     const int z = (get_global_id(2) * 2) % SRC_DEPTH;
     const int b = (get_global_id(2) * 2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2) * 2;
+#endif /* defined(SRC_DEPTH) */
 
     // Compute input address
+#if defined(SRC_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
     src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
@@ -317,7 +335,11 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
     out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
+#if defined(SRC_DEPTH)
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
 
     vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
     vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
@@ -377,11 +399,19 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
     const int z = get_global_id(2) % SRC_DEPTH;
     const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
 
     // Compute input address
+#if defined(SRC_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
     src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
 
@@ -462,7 +492,11 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
 #endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     // Compute destination address
+#if defined(SRC_DEPTH)
     __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(SRC_DEPTH) */
 
     uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
 
@@ -690,12 +724,19 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
     const int z = get_global_id(2) % SRC_DEPTH;
     const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
 
     // Compute input address
+#if defined(SRC_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
     src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
 
     // Load input tile
@@ -773,7 +814,11 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     // Store values across the channels
+#if defined(SRC_DEPTH)
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
 
     *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
     *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
@@ -843,9 +888,8 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
     *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 }
-#endif // defined(SRC_DEPTH)
 
-#if defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
 /** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
@@ -885,10 +929,18 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
     const int z = get_global_id(2) % NUM_TILES_Y;
     const int b = get_global_id(2) / NUM_TILES_Y;
+#else  /* defined(NUM_TILES_Y) */
+    const int z = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
 
+#if defined(NUM_TILES_Y)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else  /* defined(NUM_TILES_Y) */
+    __global uchar *src_addr  = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
 
     // Clamp coordinates. This clamp is valid for all rows
     int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
@@ -1041,9 +1093,13 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     // Compute destination address
-    __global DATA_TYPE *dst_addr                              = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z *
-                                                                                       (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
-    uint                                     dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
+#if defined(NUM_TILES_Y)
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else  /* defined(NUM_TILES_Y) */
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(NUM_TILES_Y) */
+
+    uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
 
     *((__global DATA_TYPE *)dst_addr) = out0;
     dst_addr += dst_plane_stride;
@@ -1273,11 +1329,19 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
     const int z = get_global_id(2) % NUM_TILES_Y;
     const int b = get_global_id(2) / NUM_TILES_Y;
+#else  /* defined(NUM_TILES_Y) */
+    const int z = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
 
     // Compute input address
+#if defined(NUM_TILES_Y)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else  /* defined(NUM_TILES_Y) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
     // Clamp coordinates. This clamp is valid for all rows
@@ -1509,10 +1573,14 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
     OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
     OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
     OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     // Store values across the channels
+#if defined(NUM_TILES_Y)
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* NUM_TILES_Y */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* NUM_TILES_Y */
 
     *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
     *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
@@ -1582,10 +1650,9 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
     *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 }
-#endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-#if defined(SRC_DEPTH)
 /** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
@@ -1801,9 +1868,8 @@ __kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
                                                  src_stride_w,
                                                  dst_stride_w);
 }
-#endif // defined(SRC_DEPTH)
 
-#if defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
 /** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
@@ -1919,7 +1985,6 @@ __kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
 #endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-#if defined(SRC_DEPTH)
 /** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
@@ -2135,9 +2200,8 @@ __kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
                                                  src_stride_w,
                                                  dst_stride_w);
 }
-#endif // defined(SRC_DEPTH)
 
-#if defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
 /** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
@@ -2249,6 +2313,6 @@ __kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
                                                  src_stride_w,
                                                  dst_stride_w);
 }
-#endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 #endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
 \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index bae40f3762..2c7c05fdd1 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -23,7 +23,7 @@
  */
 #include "helpers.h"
 
-#if defined(SRC_DEPTH) && defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
 /** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
  *
  * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
@@ -64,9 +64,13 @@ __kernel void winograd_output_transform_2x2_3x3_nchw(
 )
 {
     // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
     const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
     DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
@@ -123,7 +127,9 @@ __kernel void winograd_output_transform_2x2_3x3_nchw(
     int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
     int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
     int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
 #if defined(HAS_BIAS)
     // Add bias
@@ -136,7 +142,11 @@ __kernel void winograd_output_transform_2x2_3x3_nchw(
 #endif // defined(HAS_BIAS)
 
     // Get output address
+#if defined(SRC_DEPTH)
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -197,9 +207,13 @@ __kernel void winograd_output_transform_4x4_3x3_nchw(
 )
 {
     // Each thread stores a 4x4/4x1 or 1x4 tile
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
     const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the channels to compose the 6x6 or 6x1 tile
     DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
@@ -312,7 +326,9 @@ __kernel void winograd_output_transform_4x4_3x3_nchw(
     int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
     int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
     int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
 #if defined(HAS_BIAS)
     // Add bias
@@ -327,7 +343,11 @@ __kernel void winograd_output_transform_4x4_3x3_nchw(
 #endif // defined(HAS_BIAS)
 
     // Get output address
+#if defined(SRC_DEPTH)
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -403,9 +423,13 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     int dst_size)
 {
     // Each thread stores a 4x4/4x1 or 1x4 tile
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
     const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
     DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
@@ -519,7 +543,9 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     int x_out = get_global_id(0);
     int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
     int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
 #if defined(HAS_BIAS)
     // Add bias
@@ -551,8 +577,12 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
 #endif // defined(HAS_BIAS)
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(SRC_DEPTH)
     int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
-    offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+#else /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
 
     // Store the 1x4 output tile
     *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out00;
@@ -570,7 +600,11 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
     // Get output address
+#if defined(SRC_DEPTH)
     int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
     offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
     int4 mult_y = min((int4)dst_size - offset, (int4)1);                                 // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
 
@@ -652,18 +686,28 @@ __kernel void winograd_output_transform_4x4_5x5_nchw(
 )
 {
     // Each thread stores a 4x4/4x1 or 1x4 tile
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
     const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     // Compute output address
     int y_in  = get_global_id(1);
     int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
     int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
     int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
+#if defined(SRC_DEPTH)
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the channels to compose the input tile
     DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
@@ -705,14 +749,14 @@ __kernel void winograd_output_transform_4x4_5x5_nchw(
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    DATA_TYPE d10                                                           = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
-    DATA_TYPE d11                                                           = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
-    DATA_TYPE d12                                                           = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
-    DATA_TYPE d13                                                           = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
-    DATA_TYPE d14                                                           = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
-    DATA_TYPE d15                                                           = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
-    DATA_TYPE d16                                                           = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
-    DATA_TYPE d17                                                           = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
 
     DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
     DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
@@ -861,15 +905,21 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc(
     int dst_size)
 {
     // Each thread stores a 4x4/4x1 or 1x4 tile
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
     const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     int y_in  = get_global_id(1);
     int x_out = get_global_id(0);
     int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
     int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the channels to compose the input tile
     DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
@@ -903,8 +953,12 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc(
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Get output address
+#if defined(SRC_DEPTH)
     int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
-    offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+#else                                                                               /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif                                                                              /* defined(SRC_DEPTH) */
+    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
 
     *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out00;
     *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out01;
@@ -1031,7 +1085,11 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc(
     out_col3 += (VEC_DATA_TYPE(DATA_TYPE, 4))b;
 #endif // defined(HAS_BIAS)
     // Get output address
+#if defined(SRC_DEPTH)
     int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
     offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
     int4 mult_y = min((int4)dst_size - offset, (int4)1);                                 // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
 
@@ -1730,4 +1788,4 @@ __kernel void winograd_output_transform_1x4_1x5_nhwc(
                                            dst_size);
 }
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#endif // defined(SRC_DEPTH) && defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index f76ade1d32..1c31ceba99 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -155,6 +155,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(output->info()->dimension(1)));
+    const size_t total_batches = input->info()->tensor_shape().total_size_upper(3);
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
@@ -167,13 +168,13 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
     build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
     if(data_layout == DataLayout::NHWC)
     {
-        build_opts.add_option("-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
+        build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
         build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
         build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
     }
     else
     {
-        build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
+        build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
     }
 
     // Create kernel
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index dc0a0e7f8f..7f1afe0058 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -157,6 +157,7 @@ void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const IC
                                                                 kernel_size,
                                                                 output_tile_size,
                                                                 conv_info);
+    const size_t total_batches = output->info()->tensor_shape().total_size_upper(3);
 
     // Set build options
     CLBuildOptions build_opts;
@@ -165,7 +166,7 @@ void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const IC
     build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
     build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
+    build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
     build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
     build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");