From 149fdf3cad6b42ed302ebe2b0d614a36b9b4d81c Mon Sep 17 00:00:00 2001
From: Giorgio Arena <giorgio.arena@arm.com>
Date: Wed, 4 Jul 2018 17:03:33 +0100
Subject: COMPMID-1337 Implementing Winograd Convolution Layer 1x3 and 3x1
 kernels on OpenCL NHWC

Change-Id: Ia07e0dfcbcd07366c4bcb956e298369fb12a0369
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/138759
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 src/core/CL/CLHelpers.cpp                          |  10 +-
 src/core/CL/CLKernelLibrary.cpp                    |   6 +
 .../CL/cl_kernels/winograd_filter_transform.cl     | 173 ++++++++++++++----
 src/core/CL/cl_kernels/winograd_input_transform.cl | 196 ++++++++++++++++++---
 .../CL/cl_kernels/winograd_output_transform.cl     | 183 +++++++++++++++++--
 .../CL/kernels/CLWinogradInputTransformKernel.cpp  |  34 ++--
 tests/datasets/WinogradOutputTransformDataset.h    |  31 +++-
 tests/validation/CL/Winograd.cpp                   |   8 +-
 tests/validation/reference/Winograd.cpp            |  14 --
 9 files changed, 537 insertions(+), 118 deletions(-)

diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index cd60c6e446..3965be76fd 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -153,7 +153,7 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
 
     using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
 
-    std::vector<WinogradConfiguration> winograd_filter_transform_nchw =
+    std::vector<WinogradConfiguration> winograd_configs_nchw =
     {
         WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)),
         WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
@@ -166,9 +166,11 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
         WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))
     };
 
-    std::vector<WinogradConfiguration> winograd_filter_transform_nhwc =
+    std::vector<WinogradConfiguration> winograd_configs_nhwc =
     {
         WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
+        WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
+        WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
     };
@@ -179,11 +181,11 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
     // Return true if supported
     if(data_layout == DataLayout::NCHW)
     {
-        return (std::find(winograd_filter_transform_nchw.begin(), winograd_filter_transform_nchw.end(), p) != winograd_filter_transform_nchw.end());
+        return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end());
     }
     else
     {
-        return (std::find(winograd_filter_transform_nhwc.begin(), winograd_filter_transform_nhwc.end(), p) != winograd_filter_transform_nhwc.end());
+        return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end());
     }
 }
 } // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 7f26b04741..03731ee93c 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -376,6 +376,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "winograd_filter_transform_4x4_5x5_nchw", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_4x1_5x1_nchw", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_1x4_1x5_nchw", "winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x1_3x1_nhwc", "winograd_filter_transform.cl" },
+    { "winograd_filter_transform_1x4_1x3_nhwc", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_4x4_3x3_nhwc", "winograd_filter_transform.cl" },
     { "winograd_filter_transform_4x4_5x5_nhwc", "winograd_filter_transform.cl" },
     { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd_input_transform.cl" },
@@ -390,6 +392,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd_input_transform.cl" },
     { "winograd_input_transform_4x1_5x1_stepz1_nchw", "winograd_input_transform.cl" },
     { "winograd_input_transform_1x4_1x5_stepz1_nchw", "winograd_input_transform.cl" },
+    { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "winograd_input_transform.cl" },
+    { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "winograd_input_transform.cl" },
     { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "winograd_input_transform.cl" },
     { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "winograd_input_transform.cl" },
     { "winograd_output_transform_2x2_3x3_nchw", "winograd_output_transform.cl" },
@@ -401,6 +405,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "winograd_output_transform_4x4_5x5_nchw", "winograd_output_transform.cl" },
     { "winograd_output_transform_4x1_5x1_nchw", "winograd_output_transform.cl" },
     { "winograd_output_transform_1x4_1x5_nchw", "winograd_output_transform.cl" },
+    { "winograd_output_transform_4x1_3x1_nhwc", "winograd_output_transform.cl" },
+    { "winograd_output_transform_1x4_1x3_nhwc", "winograd_output_transform.cl" },
     { "winograd_output_transform_4x4_3x3_nhwc", "winograd_output_transform.cl" },
     { "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
     { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/winograd_filter_transform.cl
index 5f528d4b0e..e53da9b278 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_filter_transform.cl
@@ -285,9 +285,11 @@ __kernel void winograd_filter_transform_4x4_3x3_nchw(
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 }
 
-/** This OpenCL kernel performs Winograd filter transform 3x3 when the data layout is NHWC and the output tile is 4x4
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -317,32 +319,26 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
     const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
 
     // Load the values from the input tensor
-    float w00 = *((__global float *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
-    float w01 = *((__global float *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
-    float w02 = *((__global float *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
-    float w10 = *((__global float *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
-    float w11 = *((__global float *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
-    float w12 = *((__global float *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
-    float w20 = *((__global float *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
-    float w21 = *((__global float *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
-    float w22 = *((__global float *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
-
-    // Transform the 3x3 tile in a 6x6 tile
-    float out00, out01, out02, out03, out04, out05;
-    float out10, out11, out12, out13, out14, out15;
-    float out20, out21, out22, out23, out24, out25;
-    float out30, out31, out32, out33, out34, out35;
-    float out40, out41, out42, out43, out44, out45;
-    float out50, out51, out52, out53, out54, out55;
-
-    out00 = out01 = out02 = out03 = out04 = out05 = 0.f;
-    out10 = out11 = out12 = out13 = out14 = out15 = 0.f;
-    out20 = out21 = out22 = out23 = out24 = out25 = 0.f;
-    out30 = out31 = out32 = out33 = out34 = out35 = 0.f;
-    out40 = out41 = out42 = out43 = out44 = out45 = 0.f;
-    out50 = out51 = out52 = out53 = out54 = out55 = 0.f;
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    float w00 = *((__global float *)(src_addr + 0 * src_stride_z));
+    float w01 = *((__global float *)(src_addr + 1 * src_stride_z));
+    float w02 = *((__global float *)(src_addr + 2 * src_stride_z));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    float  w00 = *((__global float *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
+    float  w01 = *((__global float *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
+    float  w02 = *((__global float *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    float  w10 = *((__global float *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+    float  w11 = *((__global float *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+    float  w12 = *((__global float *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+    float  w20 = *((__global float *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+    float  w21 = *((__global float *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+    float  w22 = *((__global float *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
     // Row 0
+    float out00, out01, out02, out03, out04, out05;
     out00 = (w00) / 16.f;
     out01 = (-w00 - w01 - w02) / 24.f;
     out02 = (-w00 + w01 - w02) / 24.f;
@@ -350,7 +346,9 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
     out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f;
     out05 = (w02) / 4.f;
 
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     // Row 1
+    float out10, out11, out12, out13, out14, out15;
     out10 = (-w00 - w10 - w20) / 24.f;
     out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
     out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
@@ -359,6 +357,7 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
     out15 = (-w02 - w12 - w22) / 6.f;
 
     // Row 2
+    float out20, out21, out22, out23, out24, out25;
     out20 = (-w00 + w10 - w20) / 24.f;
     out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
     out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
@@ -367,6 +366,7 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
     out25 = (-w02 + w12 - w22) / 6.f;
 
     // Row 3
+    float out30, out31, out32, out33, out34, out35;
     out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
     out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
     out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
@@ -375,6 +375,7 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
     out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;
 
     // Row 4
+    float out40, out41, out42, out43, out44, out45;
     out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
     out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
     out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
@@ -383,26 +384,31 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
     out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;
 
     // Row 5
+    float out50, out51, out52, out53, out54, out55;
     out50 = (w20) / 4.f;
     out51 = (-w20 - w21 - w22) / 6.f;
     out52 = (-w20 + w21 - w22) / 6.f;
     out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f;
     out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f;
     out55 = (w22);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
     int x0 = get_global_id(2); // idx filter
     int y0 = get_global_id(0); // idx channel
 
     // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(float) + y0 * dst_stride_y;
 
     // Store the values across the channels
-    *(__global float *)(dst_addr + 0 * dst_stride_z)  = out00;
-    *(__global float *)(dst_addr + 1 * dst_stride_z)  = out01;
-    *(__global float *)(dst_addr + 2 * dst_stride_z)  = out02;
-    *(__global float *)(dst_addr + 3 * dst_stride_z)  = out03;
-    *(__global float *)(dst_addr + 4 * dst_stride_z)  = out04;
-    *(__global float *)(dst_addr + 5 * dst_stride_z)  = out05;
+    // 36 channels for 3x3 kernels
+    // 6  channels for 3x1 or 1x3 kernels
+    *(__global float *)(dst_addr + 0 * dst_stride_z) = out00;
+    *(__global float *)(dst_addr + 1 * dst_stride_z) = out01;
+    *(__global float *)(dst_addr + 2 * dst_stride_z) = out02;
+    *(__global float *)(dst_addr + 3 * dst_stride_z) = out03;
+    *(__global float *)(dst_addr + 4 * dst_stride_z) = out04;
+    *(__global float *)(dst_addr + 5 * dst_stride_z) = out05;
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     *(__global float *)(dst_addr + 6 * dst_stride_z)  = out10;
     *(__global float *)(dst_addr + 7 * dst_stride_z)  = out11;
     *(__global float *)(dst_addr + 8 * dst_stride_z)  = out12;
@@ -433,7 +439,108 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
     *(__global float *)(dst_addr + 33 * dst_stride_z) = out53;
     *(__global float *)(dst_addr + 34 * dst_stride_z) = out54;
     *(__global float *)(dst_addr + 35 * dst_stride_z) = out55;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
 }
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 /** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -1264,4 +1371,4 @@ __kernel void winograd_filter_transform_1x4_1x5_nchw(
                                            dst_step_z,
                                            dst_offset_first_element_in_bytes);
 }
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
\ No newline at end of file
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index fe1c0b3c1d..01cbc84ff3 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -555,12 +555,16 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
 }
 
 #if defined(SRC_DIM_1) && defined(SRC_DIM_2)
-/** This OpenCL kernel computes the input transform when the output tile is 4x4, the filter size 3x3 and the data layout is NHWC
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
  * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
  * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
  * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
  *
  * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
@@ -587,20 +591,25 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     int y = get_global_id(1);
     int z = get_global_id(2);
 
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_stride_x;
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float);
 
     // Clamp coordinates. This clamp is valid for all rows
-    int4 y_coord0 = (int4)(y * 4) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
-    int2 y_coord1 = (int2)(y * 4) + (int2)(4, 5) - (int2)PAD_LEFT;
+    int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
+    int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT;
     y_coord0      = clamp(y_coord0, -1, SRC_DIM_1);
     y_coord1      = clamp(y_coord1, -1, SRC_DIM_1);
 
+    int  z_coord;
+    int4 valid_y0;
+    int2 valid_y1;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     // Row4
-    int z_coord = (z * 4) - PAD_TOP + 4;
+    z_coord = (z * 4) - PAD_TOP + 4;
 
     // If z < 0, set y to -1
-    int4 valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
-    int2 valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
+    valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
+    valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
     // If z >= SRC_DIM_2, set y to SRC_DIM_2
     valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
     valid_y1 = select(valid_y1, SRC_DIM_1, (int2)z_coord >= SRC_DIM_2);
@@ -628,9 +637,11 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     k3 += -2.0f * d41 + 2.0f * d43 - d42;
     k4 += 2.0f * d41 - 2.0f * d43 - d42;
     k5 += 4.0f * d41 - 5.0f * d43 + d45;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
+#if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     // Row0
-    z_coord = (z * 4) - PAD_TOP + 0;
+    z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 0;
 
 #if PAD_TOP != 0
     valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
@@ -649,9 +660,36 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     float d03 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
     float d04 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
     float d05 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+#else  // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    int4 z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
+    int2 z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
+
+    valid_y0 = select((int4)y_coord0.s0, (int4) - 1, z_coords0 < (int4)0);
+    valid_y1 = select((int2)y_coord0.s0, (int2) - 1, z_coords1 < (int2)0);
+    valid_y0 = select(valid_y0, (int4)SRC_DIM_1, z_coords0 >= (int4)SRC_DIM_2);
+    valid_y1 = select(valid_y1, (int2)SRC_DIM_1, z_coords1 >= (int2)SRC_DIM_2);
+
+    z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)(SRC_DIM_2 - 1));
+    z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)(SRC_DIM_2 - 1));
+
+    float d00 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
+    float d01 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
+    float d02 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
+    float d03 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
+    float d04 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
+    float d05 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    float out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
+    float out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
+    float out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
+    float out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
+    float out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
+    float out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
 
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     // Row2
-    z_coord  = (z * 4) - PAD_TOP + 2;
+    z_coord  = (z * OUTPUT_TILE_H) - PAD_TOP + 2;
     valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
     valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
     valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
@@ -665,17 +703,12 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     float d24 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
     float d25 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 
-    // Compute destination address
-    __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_x + (y + z * (int)NUM_TILES_X) * dst_stride_y);
-
-    uint dst_plane_stride = dst_stride_z / sizeof(float);
-
-    float out0  = k0;
-    float out1  = k1;
-    float out2  = k2;
-    float out3  = k3;
-    float out4  = k4;
-    float out5  = k5;
+    out0 += k0;
+    out1 += k1;
+    out2 += k2;
+    out3 += k3;
+    out4 += k4;
+    out5 += k5;
     float out6  = k0;
     float out7  = k1;
     float out8  = k2;
@@ -702,12 +735,17 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     float out29 = k5;
 
     // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
-    out0 += 16.0f * d00 - 20.0f * d02 - 20.0f * d20 + 25.0f * d22 + 4.0f * d04 - 5.0f * d24;
-    out1 += -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 20.0f * d21 + 20.0f * d22 - 5.0f * d23 + 4.0f * d04 - 5.0f * d24;
-    out2 += 16.0f * d01 - 16.0f * d02 - 4.0f * d03 - 20.0f * d21 + 20.0f * d22 + 5.0f * d23 + 4.0f * d04 - 5.0f * d24;
-    out3 += -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 10.0f * d21 + 5.0f * d22 - 10.0f * d23 + 4.0f * d04 - 5.0f * d24;
-    out4 += 8.0f * d01 - 4.0f * d02 - 8.0f * d03 - 10.0f * d21 + 5.0f * d22 + 10.0f * d23 + 4.0f * d04 - 5.0f * d24;
-    out5 += 16.0f * d01 - 20.0f * d03 - 20.0f * d21 + 4.0f * d05 + 25.0f * d23 - 5.0f * d25;
+    out0 += -20.0f * d20 + 25.0f * d22 - 5.0f * d24;
+    out1 += 20.0f * d21 + 20.0f * d22 - 5.0f * d23 - 5.0f * d24;
+    out2 += -20.0f * d21 + 20.0f * d22 + 5.0f * d23 - 5.0f * d24;
+    out3 += 10.0f * d21 + 5.0f * d22 - 10.0f * d23 - 5.0f * d24;
+    out4 += -10.0f * d21 + 5.0f * d22 + 10.0f * d23 - 5.0f * d24;
+    out5 += -20.0f * d21 + 25.0f * d23 - 5.0f * d25;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Compute destination address
+    __global float *dst_addr         = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(float) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
+    uint            dst_plane_stride = dst_stride_z / sizeof(float);
 
     *((__global float *)dst_addr) = out0;
     dst_addr += dst_plane_stride;
@@ -722,8 +760,9 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     *((__global float *)dst_addr) = out5;
     dst_addr += dst_plane_stride;
 
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     // Row1
-    z_coord = (z * 4) - PAD_TOP + 1;
+    z_coord = (z * OUTPUT_TILE_H) - PAD_TOP + 1;
     // Row1 can never be out of bounds
     valid_y0 = y_coord0;
     valid_y1 = y_coord1;
@@ -736,7 +775,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     float d15 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row3
-    z_coord  = (z * 4) - PAD_TOP + 3;
+    z_coord  = (z * OUTPUT_TILE_H) - PAD_TOP + 3;
     valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
     valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
     valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
@@ -859,7 +898,7 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     dst_addr += dst_plane_stride;
 
     // Row5
-    z_coord  = (z * 4) - PAD_TOP + 5;
+    z_coord  = (z * OUTPUT_TILE_H) - PAD_TOP + 5;
     valid_y0 = select(y_coord0, -1, (int4)z_coord < 0);
     valid_y1 = select(y_coord1, -1, (int2)z_coord < 0);
     valid_y0 = select(valid_y0, SRC_DIM_1, (int4)z_coord >= SRC_DIM_2);
@@ -894,7 +933,106 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     dst_addr += dst_plane_stride;
     *((__global float *)dst_addr) = out5;
     dst_addr += dst_plane_stride;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes);
+}
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_offset_first_element_in_bytes);
 }
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
 #endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
 
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index c63b206080..61f0f61db7 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -351,9 +351,13 @@ __kernel void winograd_output_transform_4x4_3x3_nchw(
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
 
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NHWC
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
  *
  * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -381,12 +385,12 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
 #endif // defined(HAS_BIAS)
     int dst_size)
 {
-    // Each thread stores a 4x4 tile
+    // Each thread stores a 4x4/4x1 or 1x4 tile
     Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
 
     const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
 
-    // Load the values across the 36 channels to compose the 6x6 tile
+    // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
     float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
     float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
     float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
@@ -394,6 +398,14 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
     float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
 
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute out00, out01, out02 and out03
+    float out00 = d00 + d01 + d02 + d03 + d04;
+    float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
+    float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
+    float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
     float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
     float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
     float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
@@ -484,11 +496,12 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
     out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
     out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
     int y_in  = get_global_id(1);
     int x_out = get_global_id(0);
-    int y_out = (y_in % NUM_TILES_X) * 4;
-    int z_out = (y_in / NUM_TILES_X) * 4;
+    int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+    int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
 
 #if defined(HAS_BIAS)
     // Add bias
@@ -500,7 +513,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     out01 += (float)b;
     out02 += (float)b;
     out03 += (float)b;
-
+#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     out10 += (float)b;
     out11 += (float)b;
     out12 += (float)b;
@@ -515,9 +528,29 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     out31 += (float)b;
     out32 += (float)b;
     out33 += (float)b;
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #endif // defined(HAS_BIAS)
 
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
+    offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+
+    // Store the 1x4 output tile
+    *((__global float *)(dst_ptr + offset.s0)) = out00;
+    *((__global float *)(dst_ptr + offset.s1)) = out01;
+    *((__global float *)(dst_ptr + offset.s2)) = out02;
+    *((__global float *)(dst_ptr + offset.s3)) = out03;
+#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+    // Store the 4x1 output tile
+    int  offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+    int4 mult_y = min(dst_size - offset, 1);
+
+    *((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset)) = out00;
+    *((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset)) = out01;
+    *((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset)) = out02;
+    *((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset)) = out03;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
     // Get output address
     int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
     offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
@@ -540,7 +573,127 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
     *((__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;
     *((__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;
     *((__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;
+
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+}
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_3x1_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
 }
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x3_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact)  \
     ({                                                                   \
@@ -646,14 +799,14 @@ __kernel void winograd_output_transform_4x4_5x5_nchw(
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
-    float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
-    float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
-    float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
-    float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
-    float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
-    float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
-    float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
+    float d10                                                              = *((__global float *)(src_addr + 8 * src_stride_z));
+    float d11                                                              = *((__global float *)(src_addr + 9 * src_stride_z));
+    float d12                                                              = *((__global float *)(src_addr + 10 * src_stride_z));
+    float d13                                                              = *((__global float *)(src_addr + 11 * src_stride_z));
+    float d14                                                              = *((__global float *)(src_addr + 12 * src_stride_z));
+    float d15                                                              = *((__global float *)(src_addr + 13 * src_stride_z));
+    float d16                                                              = *((__global float *)(src_addr + 14 * src_stride_z));
+    float d17                                                              = *((__global float *)(src_addr + 15 * src_stride_z));
 
     float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
     float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
@@ -1290,4 +1443,4 @@ __kernel void winograd_output_transform_1x4_1x5_nchw(
                                           );
 }
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
\ No newline at end of file
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index bb484afafb..fcfd9e30a1 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -68,35 +69,28 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 {
     ARM_COMPUTE_UNUSED(output);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    const PadStrideInfo conv_info        = winograd_info.convolution_info;
-    const Size2D        output_tile_size = winograd_info.output_tile_size;
-    const Size2D        kernel_size      = winograd_info.kernel_size;
 
-    unsigned int num_elems_read_per_iteration_x = 0;
-    unsigned int num_elems_read_per_iteration_y = 0;
-    unsigned int pad_left                       = 0;
-    unsigned int pad_top                        = 0;
+    bool   window_changed = false;
+    Window win            = calculate_max_window(*input, Steps(1, 1));
 
     if(input->data_layout() == DataLayout::NCHW)
     {
-        num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
-        num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
-        pad_left                       = conv_info.pad_left();
-        pad_top                        = conv_info.pad_top();
+        const PadStrideInfo conv_info        = winograd_info.convolution_info;
+        const Size2D        output_tile_size = winograd_info.output_tile_size;
+        const Size2D        kernel_size      = winograd_info.kernel_size;
+
+        unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
+        unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
+
+        AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+        window_changed = update_window_and_padding(win, input_access);
     }
     else
     {
-        num_elems_read_per_iteration_x = 1;
-        num_elems_read_per_iteration_y = output_tile_size.width + kernel_size.width - 1;
-        pad_top                        = 1;
+        AccessWindowStatic input_access(input, 0, -1, input->dimension(0), input->dimension(1) + 1);
+        window_changed = update_window_and_padding(win, input_access);
     }
 
-    Window win = calculate_max_window(*input, Steps(1, 1));
-
-    AccessWindowRectangle input_access(input, -pad_left, -pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
-
-    bool window_changed = update_window_and_padding(win, input_access);
-
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h
index fc23e65258..4085e91854 100644
--- a/tests/datasets/WinogradOutputTransformDataset.h
+++ b/tests/datasets/WinogradOutputTransformDataset.h
@@ -178,6 +178,20 @@ class SmallWinogradOutputTransformDatasetNHWC final : public WinogradOutputTrans
 public:
     SmallWinogradOutputTransformDatasetNHWC()
     {
+        // (4x1, 3x1)
+        add_config(TensorShape(13U, 12U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(7U, 22U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(1U, 462U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(53U, 33U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+        add_config(TensorShape(7U, 20U, 6U, 3U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(24U, 56U, 6U, 2U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+
+        // (1x4, 1x3)
+        add_config(TensorShape(13U, 7U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(7U, 30U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(1U, 477U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+        add_config(TensorShape(7U, 16U, 6U, 3U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(24U, 56U, 6U, 2U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+
         // (4x4, 3x3)
         add_config(TensorShape(13U, 4U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(10U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
         add_config(TensorShape(13U, 6U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
@@ -200,7 +214,6 @@ class LargeWinogradOutputTransformDatasetNCHW final : public WinogradOutputTrans
 public:
     LargeWinogradOutputTransformDatasetNCHW()
     {
-        // NCHW
         // (2x2, 3x3)
         add_config(TensorShape(64U, 12544U, 16U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(224U, 224U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
         add_config(TensorShape(32U, 3080U, 16U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW));
@@ -274,6 +287,22 @@ class LargeWinogradOutputTransformDatasetNHWC final : public WinogradOutputTrans
 public:
     LargeWinogradOutputTransformDatasetNHWC()
     {
+        // (4x1, 3x1)
+        add_config(TensorShape(64U, 12488U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(224U, 223U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+        add_config(TensorShape(32U, 3080U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(112U, 110U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+        add_config(TensorShape(13U, 784U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(56U, 56U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+        add_config(TensorShape(64U, 12488U, 6U, 3U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(32U, 3080U, 6U, 2U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(13U, 784U, 6U, 5U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+
+        // (1x4, 1x3)
+        add_config(TensorShape(64U, 12544U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+        add_config(TensorShape(32U, 3136U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+        add_config(TensorShape(13U, 784U, 6U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+        add_config(TensorShape(64U, 12544U, 6U, 3U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(224U, 223U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(32U, 3024U, 6U, 2U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(112U, 110U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+        add_config(TensorShape(13U, 784U, 6U, 5U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+
         // (4x4, 3x3)
         add_config(TensorShape(64U, 3136U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(224U, 224U), PadStrideInfo(1, 1, 1, 1), DataLayout::NHWC));
         add_config(TensorShape(32U, 784U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 849d0c13bc..c39cb4e790 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -70,7 +70,9 @@ const auto SmallWinogradInputTransformDatasetNCHW =
                                       datasets::SmallWinogradInputTransformDataset1x4_1x5()))))))));
 
 const auto SmallWinogradInputTransformDatasetNHWC = framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x4_3x3(),
-                                                                               datasets::SmallWinogradInputTransformDataset4x4_5x5());
+                                                    framework::dataset::concat(datasets::SmallWinogradInputTransformDataset4x1_3x1(),
+                                                    framework::dataset::concat(datasets::SmallWinogradInputTransformDataset1x4_1x3(),
+                                                                               datasets::SmallWinogradInputTransformDataset4x4_5x5())));
 
 const auto LargeWinogradInputTransformDatasetNCHW =
            framework::dataset::concat(datasets::LargeWinogradInputTransformDataset2x2_3x3(),
@@ -98,7 +100,9 @@ const auto SmallWinogradFilterTransformDatasetNCHW =
 
 const auto SmallWinogradFilterTransformDatasetNHWC =
            framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })),
-                                      combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })));
+           framework::dataset::concat(combine(datasets::Small3x1Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 1U) })),
+           framework::dataset::concat(combine(datasets::Small1x3Shapes(), framework::dataset::make("OutputTile", { Size2D(1U, 4U) })),
+                                      combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", { Size2D(4U, 4U) })))));
 
 const auto LargeWinogradFilterTransformDatasetNCHW =
            framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", { Size2D(2U, 2U), Size2D(4U, 4U) })),
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 026b30031c..132d252383 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -193,20 +193,6 @@ void initialize_matrix_transform(SimpleTensor<T> &src, const Size2D &output_tile
 }
 } // namespace
 
-template <typename T>
-void print_tile(SimpleTensor<T> &in)
-{
-    for(int y = 0; y < in.shape()[1]; y++)
-    {
-        for(int x = 0; x < in.shape()[0]; x++)
-        {
-            std::cout << in[x + y * in.shape()[0]] << " ";
-        }
-
-        std::cout << std::endl;
-    }
-}
-
 template <typename T>
 SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info)
 {
-- 
cgit v1.2.1