From d2fab7315bac3a586f2f1b1c8d64f2441f89ca64 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Fri, 2 Mar 2018 11:18:12 +0000
Subject: COMPMID-935 - Implementing Convolution with Winograd on OpenCL (part
 4)

Implemented Winograd Output Transform (2x2,3x3) on OpenCL
Implemented CLWinogradConvolutionLayer on OpenCL

Change-Id: I6a113fc5f052ca07f878d2b800d2ab003f84af65
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125148
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/CL/CLKernels.h                    |   1 +
 .../core/CL/kernels/CLGEMMMatrixMultiplyKernel.h   |   1 +
 .../CL/kernels/CLWinogradOutputTransformKernel.h   |  81 +++++++
 arm_compute/core/utils/misc/ShapeCalculator.h      |  40 +++-
 arm_compute/runtime/CL/CLFunctions.h               |   1 +
 .../CL/functions/CLWinogradConvolutionLayer.h      |  97 ++++++++
 src/core/CL/CLKernelLibrary.cpp                    |   1 +
 src/core/CL/cl_kernels/gemm.cl                     | 127 +++++++++--
 src/core/CL/cl_kernels/winograd.cl                 | 247 +++++++++++++++------
 src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp |  21 +-
 src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp   |   7 +-
 .../CL/kernels/CLWinogradFilterTransformKernel.cpp |   2 +-
 .../CL/kernels/CLWinogradInputTransformKernel.cpp  |   7 +-
 .../CL/kernels/CLWinogradOutputTransformKernel.cpp | 188 ++++++++++++++++
 src/runtime/CL/functions/CLGEMM.cpp                |  14 +-
 .../CL/functions/CLWinogradConvolutionLayer.cpp    | 146 ++++++++++++
 .../CL/functions/CLWinogradInputTransform.cpp      |   2 +-
 tests/datasets/LargeConvolutionLayerDataset.h      |  24 +-
 tests/datasets/SmallConvolutionLayerDataset.h      |  12 +-
 tests/datasets/WinogradOutputTransformDataset.h    | 153 +++++++++++++
 tests/validation/CL/Winograd.cpp                   | 179 ++++++++++++++-
 tests/validation/NEON/ConvolutionLayer.cpp         |   6 +-
 tests/validation/fixtures/WinogradLayerFixture.h   | 120 ++++++++--
 tests/validation/reference/ConvolutionLayer.cpp    |   2 +-
 tests/validation/reference/Winograd.cpp            | 218 ++++++++++++++----
 tests/validation/reference/Winograd.h              |   3 +
 26 files changed, 1506 insertions(+), 194 deletions(-)
 create mode 100644 arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
 create mode 100644 arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
 create mode 100644 src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
 create mode 100644 src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
 create mode 100644 tests/datasets/WinogradOutputTransformDataset.h

diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index ef629c2e81..6f5c61523f 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -111,5 +111,6 @@
 #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
 #include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
 
 #endif /* __ARM_COMPUTE_CLKERNELS_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 7260c4a4f6..ee7e7c0e97 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -84,6 +84,7 @@ private:
     const ICLTensor *_input0;
     const ICLTensor *_input1;
     ICLTensor       *_output;
+    bool             _slide_matrix_b;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
new file mode 100644
index 0000000000..35117c65db
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__
+#define __ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Winograd output transform kernel. */
+class CLWinogradOutputTransformKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLWinogradOutputTransformKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradOutputTransformKernel(const CLWinogradOutputTransformKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradOutputTransformKernel &operator=(const CLWinogradOutputTransformKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWinogradOutputTransformKernel(CLWinogradOutputTransformKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWinogradOutputTransformKernel &operator=(CLWinogradOutputTransformKernel &&) = default;
+    /** Default destructor */
+    ~CLWinogradOutputTransformKernel() = default;
+    /** Set the input and output tensor.
+     *
+     * @param[in]  input                 Source tensor with shape [C, N, 16, batches]. Data types supported: F32.
+     * @param[in]  bias                  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
+     * @param[out] output                Destination tensor with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input
+     * @param[in]  kernel_dims           Kernel dimensions (Width and height). Currently only supported 3x3 kernels
+     * @param[in]  output_convolved_dims Output dimensions after the convolution (Width and height)
+     * @param[in]  num_tiles             Number of tiles of size 2x2 in the output tensor along the X and Y direction
+     */
+    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel
+     *
+     * @param[in]  input                 Source tensor with shape [C, N, 16, batches]. Data types supported: F32.
+     * @param[in]  bias                  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
+     * @param[out] output                Destination tensor with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input
+     * @param[in]  kernel_dims           Kernel dimensions (Width and height). Currently only supported 3x3 kernels
+     * @param[in]  output_convolved_dims Output dimensions after the convolution (Width and height)
+     * @param[in]  num_tiles             Number of tiles of size 2x2 in the output tensor along the X and Y direction
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    const ICLTensor *_bias;
+    ICLTensor       *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__ */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 1e90927a93..5344ce7e74 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -28,6 +28,8 @@
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Utils.h"
 
+#include <cmath>
+
 namespace arm_compute
 {
 namespace misc
@@ -233,19 +235,45 @@ inline TensorShape compute_winograd_input_transform_shape(const ITensorInfo &inp
 
     return output_shape;
 }
+
+inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &input, const Size2D &output_convolved_dims, DataLayout data_layout)
+{
+    TensorShape tensor_shape{ input.tensor_shape() };
+
+    // Output dimension
+    const unsigned int out_w = output_convolved_dims.width;
+    const unsigned int out_h = output_convolved_dims.height;
+    const unsigned int out_c = input.dimension(0);
+
+    tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH), out_w);
+    tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT), out_h);
+    tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL), out_c);
+
+    return tensor_shape;
+}
+
 inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info)
 {
     const TensorShape input_shape{ input.tensor_shape() };
     const TensorShape weights_shape{ weights.tensor_shape() };
 
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
+    const size_t idx_width   = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height  = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
+
+    const unsigned int input_width     = input_shape[idx_width];
+    const unsigned int input_height    = input_shape[idx_height];
+    const unsigned int weights_width   = weights_shape[idx_width];
+    const unsigned int weights_height  = weights_shape[idx_height];
+    const unsigned int weights_channel = weights_shape[idx_channel];
+    unsigned int       output_width    = 0;
+    unsigned int       output_height   = 0;
+    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info);
 
     TensorShape output_shape{ input_shape };
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-    output_shape.set(2, weights_shape[3]);
+    output_shape.set(idx_width, output_width);
+    output_shape.set(idx_height, output_height);
+    output_shape.set(idx_channel, weights_channel);
 
     return output_shape;
 }
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 7c2377aa09..adf240e642 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -107,6 +107,7 @@
 #include "arm_compute/runtime/CL/functions/CLTranspose.h"
 #include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
 #include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
 
 #endif /* __ARM_COMPUTE_CLFUNCTIONS_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
new file mode 100644
index 0000000000..14de169236
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__
+
+#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
+ *
+ *  -# @ref CLWinogradInputTransform
+ *  -# @ref CLWinogradFilterTransformKernel (only once)
+ *  -# @ref CLGEMM
+ *  -# @ref CLWinogradOutputTransformKernel
+ *
+ */
+class CLWinogradConvolutionLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Set the input and output tensors.
+     *
+     * @note: This function only works with 3x3 kernels and unit strides
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer
+     *
+     * @note: This function only works with 3x3 kernels and unit strides
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLMemoryGroup                   _memory_group;
+    CLGEMM                          _batched_mm;
+    CLWinogradInputTransform        _input_transform;
+    CLWinogradFilterTransformKernel _filter_transform;
+    CLWinogradOutputTransformKernel _output_transform;
+    CLTensor                        _input0;
+    CLTensor                        _input1;
+    CLTensor                        _batched_mm_output;
+    bool                            _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__ */
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 4b7fa8a3b3..9df2dcbacd 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -354,6 +354,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "winograd_filter_transform_2x2_3x3_nchw", "winograd.cl" },
     { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl" },
     { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl" },
+    { "winograd_output_transform_2x2_3x3_nchw", "winograd.cl" },
     { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
     { "YUYV422_to_NV12_bt709", "color_convert.cl" },
     { "YUYV422_to_RGB888_bt709", "color_convert.cl" },
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index cba5eea437..a5b0acbe9c 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -162,6 +162,8 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -199,8 +201,18 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global float *src_addr_a = (__global float *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global float *src_addr_b = (__global float *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global float *src_end_addr_b = src_addr_b + COLS_B;
@@ -277,6 +289,9 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -314,8 +329,18 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global float *src_addr_a = (__global float *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global float *src_addr_b = (__global float *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global float *src_end_addr_b = src_addr_b + COLS_B;
@@ -510,6 +535,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -547,8 +574,18 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global half *src_addr_a = (__global half *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global half *src_addr_b = (__global half *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global half *src_end_addr_b = src_addr_b + COLS_B;
@@ -627,8 +664,9 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- *
- * @note: ALPHA must be passed in 8 bit fixed point format
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ * @note:ALPHA must be passed in 8 bit fixed point format
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -666,8 +704,18 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global char *src_addr_a = src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
-    __global char *src_addr_b = src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes;
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global char *src_addr_a = (__global char *)(src0_ptr + src0_addr_in_bytes);
+    __global char *src_addr_b = (__global char *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global char *src_end_addr_b = src_addr_b + COLS_B;
@@ -738,8 +786,9 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- *
- * @note: ALPHA must be passed in 16 bit fixed point format
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ * @note:ALPHA must be passed in 16 bit fixed point format
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -777,8 +826,18 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global short *src_addr_a = (__global short *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global short *src_addr_b = (__global short *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global short *src_addr_a = (__global short *)(src0_ptr + src0_addr_in_bytes);
+    __global short *src_addr_b = (__global short *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global short *src_end_addr_b = src_addr_b + COLS_B;
@@ -845,6 +904,8 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
  * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
  * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -885,7 +946,13 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
 
     // Add offset for batched GEMM
     src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
     src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
 
@@ -1013,6 +1080,8 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
  * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
  * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
  * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1054,8 +1123,12 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
     // Add offset for batched GEMM
     src_addr.s0 += get_global_id(2) * src0_stride_z;
 
-    // For convolution layer we do not want to slide the matrix B along Z
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
     src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     // Address boundary for matrix A
     int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
@@ -1251,6 +1324,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
  * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
  * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
  * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1293,8 +1368,12 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
     // Add offset for batched GEMM
     src_addr.s0 += get_global_id(2) * src0_stride_z;
 
-    // For convolution layer we do not want to slide the matrix B along Z
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
     src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     // Address boundary for the matrix A
     int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
@@ -1460,6 +1539,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
  * @note The number matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
  * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
  * @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8/QS16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1500,7 +1581,13 @@ __kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
 
     // Add offset for batched GEMM
     src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
     src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
 
@@ -1636,6 +1723,8 @@ __kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
  * @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
  * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
  * @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8/QS16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1676,7 +1765,13 @@ __kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
 
     // Add offset for batched GEMM
     src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
     src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));
 
diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl
index 238e21a18a..25c129d0aa 100644
--- a/src/core/CL/cl_kernels/winograd.cl
+++ b/src/core/CL/cl_kernels/winograd.cl
@@ -23,8 +23,102 @@
  */
 #include "helpers.h"
 
-#if defined(NUM_TILES_X)
+#if defined(NUM_CHANNELS)
+
+/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2
+ *
+ * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+    float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
+    float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
+    float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+    // Transform the 3x3 tile in a 4x4 tile
+    float4 out0 = 0.0f;
+    float4 out1 = 0.0f;
+    float4 out2 = 0.0f;
+    float4 out3 = 0.0f;
+
+    // Row 0
+    out0.s0 = (w0.s0);
+    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+    out0.s3 = (w0.s2);
+
+    // Row 1
+    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+    // Row 2
+    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+    // Row 3
+    out3.s0 = (w2.s0);
+    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+    out3.s3 = (w2.s2);
 
+    int z  = get_global_id(2);
+    int x0 = z / NUM_CHANNELS; // idx filter
+    int y0 = z % NUM_CHANNELS; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the 16 values across the 16 channels
+    *(__global float *)(dst_addr + 0 * dst_stride_z)  = out0.s0;
+    *(__global float *)(dst_addr + 1 * dst_stride_z)  = out0.s1;
+    *(__global float *)(dst_addr + 2 * dst_stride_z)  = out0.s2;
+    *(__global float *)(dst_addr + 3 * dst_stride_z)  = out0.s3;
+    *(__global float *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
+    *(__global float *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
+    *(__global float *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
+    *(__global float *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
+    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
+    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
+    *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+    *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+    *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+    *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+    *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+    *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+}
+#endif // defined(NUM_CHANNELS)
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP)
 /** This OpenCL kernel computes the input transform when the kernel size is 3x3 and the output tile is 2x2
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
@@ -205,13 +299,12 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
     vstore2(out32, 0, (__global float *)(dst_addr + 14 * dst_stride_z));
     vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z));
 }
-#endif //defined(NUM_TILES_X)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP)
 
-#if defined(NUM_CHANNELS)
-
-/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2
+#if defined(NUM_TILES_X)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2, the filter size 3x3 and the data format is NCHW
  *
- * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -220,8 +313,6 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
@@ -232,72 +323,84 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void winograd_filter_transform_2x2_3x3_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
 {
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
-
-    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
-    // Load the values from the input tensor
-    float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
-    float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
-
-    // Transform the 3x3 tile in a 4x4 tile
-    float4 out0 = 0.0f;
-    float4 out1 = 0.0f;
-    float4 out2 = 0.0f;
-    float4 out3 = 0.0f;
-
-    // Row 0
-    out0.s0 = (w0.s0);
-    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
-    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
-    out0.s3 = (w0.s2);
-
-    // Row 1
-    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
-    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
-    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
-    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
-
-    // Row 2
-    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
-    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
-    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
-    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
-
-    // Row 3
-    out3.s0 = (w2.s0);
-    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
-    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
-    out3.s3 = (w2.s2);
-
-    int z  = get_global_id(2);
-    int x0 = z / NUM_CHANNELS; // idx filter
-    int y0 = z % NUM_CHANNELS; // idx channel
+    // Each thread stores a 2x2 tile
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+    // Load the values across the 16 channels to compose the 4x4 tile
+    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+
+    float d10 = *((__global float *)(src_addr + 4 * src_stride_z));
+    float d11 = *((__global float *)(src_addr + 5 * src_stride_z));
+    float d12 = *((__global float *)(src_addr + 6 * src_stride_z));
+    float d13 = *((__global float *)(src_addr + 7 * src_stride_z));
+
+    float d20 = *((__global float *)(src_addr + 8 * src_stride_z));
+    float d21 = *((__global float *)(src_addr + 9 * src_stride_z));
+    float d22 = *((__global float *)(src_addr + 10 * src_stride_z));
+    float d23 = *((__global float *)(src_addr + 11 * src_stride_z));
+
+    float d30 = *((__global float *)(src_addr + 12 * src_stride_z));
+    float d31 = *((__global float *)(src_addr + 13 * src_stride_z));
+    float d32 = *((__global float *)(src_addr + 14 * src_stride_z));
+    float d33 = *((__global float *)(src_addr + 15 * src_stride_z));
+
+    // Compute the 2x2 output tile
+    float k0 = d01 + d11 + d21;
+    float k1 = d02 + d12 + d22;
+    float k2 = d11 - d21 - d31;
+    float k3 = d12 - d22 - d32;
+
+    // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+    // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+    // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
+    // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
+
+    float out00 = d10;
+    float out01 = -d13;
+    float out10 = d10;
+    float out11 = -d13;
+
+    out00 += d00 + d20 + k0 + k1;
+    out01 += k0 - k1 - (d03 + d23);
+    out10 += -d20 - d30 + k2 + k3;
+    out11 += k2 - k3 + d23 + d33;
+
+    int y_in  = get_global_id(1);
+    int x_out = (y_in % NUM_TILES_X) * 2;
+    int y_out = (y_in / NUM_TILES_X) * 2;
+    int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+    out00 += (float)b;
+    out01 += (float)b;
+    out10 += (float)b;
+    out11 += (float)b;
+#endif // defined(HAS_BIAS)
 
     // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
 
-    // Store the 16 values across the 16 channels
-    *(__global float *)(dst_addr + 0 * dst_stride_z)  = out0.s0;
-    *(__global float *)(dst_addr + 1 * dst_stride_z)  = out0.s1;
-    *(__global float *)(dst_addr + 2 * dst_stride_z)  = out0.s2;
-    *(__global float *)(dst_addr + 3 * dst_stride_z)  = out0.s3;
-    *(__global float *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
-    *(__global float *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
-    *(__global float *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
-    *(__global float *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
-    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
-    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
-    *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2;
-    *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3;
-    *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0;
-    *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1;
-    *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2;
-    *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+    // Store the 2x2 output tile
+    vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+    vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
 }
-#endif // defined(NUM_CHANNELS)
+#endif // defined(NUM_TILES_X)
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 9c69800928..7b785bb8da 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -55,6 +55,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
 
     if(!is_interleaved_transposed)
     {
@@ -174,7 +175,7 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
 } // namespace
 
 CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
 {
 }
 
@@ -192,9 +193,10 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
 
-    _input0 = input0;
-    _input1 = input1;
-    _output = output;
+    _input0         = input0;
+    _input1         = input1;
+    _output         = output;
+    _slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();
 
     const DataType data_type = input0->info()->data_type();
     const int      fp_pos    = input0->info()->fixed_point_position();
@@ -257,6 +259,9 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
                                       "-DALPHA=" + float_to_string_with_full_precision(alpha));
     }
 
+    // Do not slide matrix B if _slide_matrix_b = false
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+
     std::string kernel_name;
     if(is_interleaved_transposed)
     {
@@ -365,7 +370,7 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(_input1->info()->num_dimensions() < 3)
+        if(!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -374,9 +379,9 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
         add_2D_tensor_argument(idx, _input0, slice);
         add_2D_tensor_argument(idx, _input1, slice_b);
         add_2D_tensor_argument(idx, _output, slice);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[3]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[3]));
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, _lws_hint);
     }
     while(window.slide_window_slice_3D(slice));
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 5489fde818..f69a39e4ad 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -76,15 +76,18 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     }
 
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    window_changed = window_changed || update_window_and_padding(win, input_access);
 
     // Configure window in case of configured output
     if(output->total_size() != 0)
     {
         AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
-        window_changed = window_changed || update_window_and_padding(win, output_access);
+        window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
         output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
     }
+    else
+    {
+        window_changed = window_changed || update_window_and_padding(win, input_access);
+    }
 
     // Collapse along the Z direction
     Window collapsed = win.collapse(win, Window::DimZ);
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index 3dbbe157b2..655b82bf66 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -76,7 +76,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
     AccessWindowStatic    output_access(output, 0, 0, output->dimension(0), output->dimension(1));
     window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->valid_region());
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
 
     Window win_collapsed = win.collapse(win, Window::DimZ);
 
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index 72adb5f358..3b9350f9ba 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -44,11 +44,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_dims.width != 3 || kernel_dims.height != 3, "Winograd input transform only supports 3x3 kernels");
     ARM_COMPUTE_UNUSED(kernel_dims);
 
-    const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, Size2D(3U, 3U));
-
     // Validate configured output
     if(output->total_size() != 0)
     {
+        const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, kernel_dims);
+
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -151,7 +151,8 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
 Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &conv_info, const Size2D &kernel_dims)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, output, conv_info, kernel_dims));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, conv_info, kernel_dims));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), conv_info, kernel_dims).first);
 
     return Status{};
 }
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
new file mode 100644
index 0000000000..c9823275eb
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_dims.width != 3 || kernel_dims.height != 3, "Only 3x3 kernels are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<unsigned int>(std::sqrt(input->dimension(2))) != 4, "Only 2x2 output tile is supported");
+    ARM_COMPUTE_UNUSED(kernel_dims);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, output_convolved_dims, DataLayout::NCHW));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    AccessWindowStatic    output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), 2), ceil_to_multiple(output->dimension(1), 2));
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+        window_changed = update_window_and_padding(win, input_access, bias_access, output_access);
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access, output_access);
+    }
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims,
+                                                const Size2D &num_tiles)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(kernel_dims);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), output_convolved_dims, DataLayout::NCHW)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), kernel_dims, output_convolved_dims, num_tiles));
+
+    _input  = input;
+    _bias   = bias;
+    _output = output;
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
+    build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("winograd_output_transform_2x2_3x3_nchw", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "winograd_output_transform_2x2_3x3";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims,
+                                                 const Size2D &num_tiles)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, kernel_dims, output_convolved_dims, num_tiles));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get()).first);
+
+    return Status{};
+}
+
+void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Get initial windows
+    Window slice = window.first_slice_window_3D();
+    slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Setup output slice
+    Window slice_out(slice);
+    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    if(_bias != nullptr)
+    {
+        unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
+        add_1D_tensor_argument(idx1, _bias, slice_biases);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice, _lws_hint);
+    }
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index a06d94c1f5..172facfa78 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -66,17 +66,21 @@ Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLT
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
     if(c != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(0) != output->dimension(0), "The C matrix must have the same number of rows as the output matrix");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(1) != output->dimension(1), "The C matrix must have the same number of columns as the output matrix");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The matrix C must have the same number of rows as the matrix A");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The matrix C must have the same number of columns as the matrix B");
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000000..5081cbac4e
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _is_first_run(true)
+{
+}
+
+void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    // TODO(COMPMID-1013): This part will be removed
+    // Get indeces for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Kernel size
+    const unsigned int kernel_w = weights->info()->tensor_shape()[idx_width];
+    const unsigned int kernel_h = weights->info()->tensor_shape()[idx_height];
+
+    // Number of tiles along the X and Y direction
+    const unsigned int num_tiles_x = std::ceil((input->info()->tensor_shape().x() - (kernel_w - 1) + conv_info.pad_left() + conv_info.pad_right()) / 2.f);
+    const unsigned int num_tiles_y = std::ceil((input->info()->tensor_shape().y() - (kernel_h - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / 2.f);
+
+    // Compute output shape
+    const TensorShape output_convolved_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
+
+    // Manage intermediate tensors
+    _memory_group.manage(&_input0);
+    _memory_group.manage(&_batched_mm_output);
+
+    // Do not manage _input1 as it contains the weights
+
+    // Configure input transform
+    _input_transform.configure(input, &_input0, conv_info, Size2D(kernel_w, kernel_h));
+
+    // Configure filter transform
+    _filter_transform.configure(weights, &_input1);
+
+    // Configure batched matrix multiply
+    _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+    // Configure output transform
+    _output_transform.configure(&_batched_mm_output, biases, output, Size2D(kernel_w, kernel_h), Size2D(output_convolved_shape[idx_width], output_convolved_shape[idx_height]), Size2D(num_tiles_x,
+                                num_tiles_y));
+
+    // Allocate temporary tensors
+    _input0.allocator()->allocate();
+    _input1.allocator()->allocate();
+    _batched_mm_output.allocator()->allocate();
+}
+
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    // TODO(COMPMID-1013): This part will be removed
+    // Get indeces for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Kernel size
+    const unsigned int kernel_w = weights->tensor_shape()[idx_width];
+    const unsigned int kernel_h = weights->tensor_shape()[idx_height];
+
+    // Number of tiles along the X and Y direction
+    const unsigned int num_tiles_x = std::ceil((input->tensor_shape().x() - (kernel_w - 1) + conv_info.pad_left() + conv_info.pad_right()) / 2.f);
+    const unsigned int num_tiles_y = std::ceil((input->tensor_shape().y() - (kernel_h - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / 2.f);
+
+    // Compute output shape
+    const TensorShape output_convolved_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
+
+    // Validate input transform
+    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, Size2D(kernel_w, kernel_h));
+    const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, conv_info, Size2D(kernel_w, kernel_h)));
+
+    // Validate filter transform
+    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights);
+    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1));
+
+    // Configure batched matrix multiply
+    TensorShape batched_mm_output_shape = input0.tensor_shape();
+    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
+    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+
+    // Configure output transform
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, Size2D(kernel_w, kernel_h), Size2D(output_convolved_shape[idx_width],
+                                                                          output_convolved_shape[idx_height]),
+                                                                          Size2D(num_tiles_x, num_tiles_y)));
+
+    return Status{};
+}
+
+void CLWinogradConvolutionLayer::run()
+{
+    if(_is_first_run)
+    {
+        // Run filter transform
+        CLScheduler::get().enqueue(_filter_transform, false);
+
+        _is_first_run = false;
+    }
+
+    _memory_group.acquire();
+
+    // Run input transform
+    _input_transform.run();
+
+    // Run batched matrix multiplication
+    _batched_mm.run();
+
+    // Run output transform
+    CLScheduler::get().enqueue(_output_transform);
+
+    _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 652f31ac74..0499d4cd2f 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -40,6 +40,6 @@ void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, co
 
 Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &conv_info, const Size2D &kernel_dims)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(CLWinogradInputTransformKernel::validate(input, output, conv_info, kernel_dims));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, conv_info, kernel_dims));
     return Status{};
 }
diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h
index 086b2e3def..ec8e09fa81 100644
--- a/tests/datasets/LargeConvolutionLayerDataset.h
+++ b/tests/datasets/LargeConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,28 @@ namespace test
 {
 namespace datasets
 {
+class LargeWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset
+{
+public:
+    LargeWinogradConvolutionLayer3x3Dataset()
+    {
+        // Kernel size 3
+        // Batch size 1
+        add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 256U), TensorShape(256U), TensorShape(51U, 56U, 256U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 3U, 256U, 256U), TensorShape(256U), TensorShape(56U, 54U, 256U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 512U), TensorShape(512U), TensorShape(26U, 28U, 512U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(12U, 12U, 512U), PadStrideInfo(1, 1, 0, 0));
+        // Batch size 3, 2 and 4
+        add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(110U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 1));
+    }
+};
+
 class LargeConvolutionLayerDataset final : public ConvolutionLayerDataset
 {
 public:
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index adb61de8e2..696c396eef 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -37,10 +37,10 @@ namespace test
 {
 namespace datasets
 {
-class SmallWinogradLayerDataset final : public ConvolutionLayerDataset
+class SmallWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset
 {
 public:
-    SmallWinogradLayerDataset()
+    SmallWinogradConvolutionLayer3x3Dataset()
     {
         // Kernel size 3
         // Batch size 1
@@ -48,8 +48,14 @@ public:
         // Batch size 4
         add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(21U, 25U, 21U, 4U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 3U, 2U, 1U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 1, 1));
+    }
+};
 
-        // Kernel size 5
+class SmallWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset
+{
+public:
+    SmallWinogradConvolutionLayer5x5Dataset()
+    {
         add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U, 1U), TensorShape(1U), TensorShape(4U, 4U, 1U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 2, 2));
     }
diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h
new file mode 100644
index 0000000000..c42d6c8ebd
--- /dev/null
+++ b/tests/datasets/WinogradOutputTransformDataset.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET
+#define ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET
+
+#include "utils/TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class WinogradOutputTransformDataset
+{
+public:
+    using type = std::tuple<TensorShape, Size2D, Size2D, Size2D, DataLayout>;
+
+    struct iterator
+    {
+        iterator(std::vector<TensorShape>::const_iterator a_it,
+                 std::vector<Size2D>::const_iterator      b_it,
+                 std::vector<Size2D>::const_iterator      c_it,
+                 std::vector<Size2D>::const_iterator      d_it,
+                 std::vector<DataLayout>::const_iterator  data_layout_it)
+            : _a_it{ std::move(a_it) },
+              _b_it{ std::move(b_it) },
+              _c_it{ std::move(c_it) },
+              _d_it{ std::move(d_it) },
+              _data_layout_it{ std::move(data_layout_it) }
+        {
+        }
+
+        std::string description() const
+        {
+            std::stringstream description;
+            description << "Input=" << *_a_it << ":";
+            description << "KernelDims=" << *_b_it << ":";
+            description << "OutputDims=" << *_c_it << ":";
+            description << "NumTiles=" << *_d_it << ":";
+            description << "DataLayout=" << *_data_layout_it;
+            return description.str();
+        }
+
+        WinogradOutputTransformDataset::type operator*() const
+        {
+            return std::make_tuple(*_a_it, *_b_it, *_c_it, *_d_it, *_data_layout_it);
+        }
+
+        iterator &operator++()
+        {
+            ++_a_it;
+            ++_b_it;
+            ++_c_it;
+            ++_d_it;
+            ++_data_layout_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator _a_it;
+        std::vector<Size2D>::const_iterator      _b_it;
+        std::vector<Size2D>::const_iterator      _c_it;
+        std::vector<Size2D>::const_iterator      _d_it;
+        std::vector<DataLayout>::const_iterator  _data_layout_it;
+    };
+
+    iterator begin() const
+    {
+        return iterator(_a_shapes.begin(), _b_dims.begin(), _c_dims.begin(), _d_dims.begin(), _data_layout.begin());
+    }
+
+    int size() const
+    {
+        return std::min(_a_shapes.size(), std::min(_b_dims.size(), std::min(_c_dims.size(), std::min(_d_dims.size(), _data_layout.size()))));
+    }
+
+    void add_config(TensorShape a, Size2D b, Size2D c, Size2D d, DataLayout data_layout)
+    {
+        _a_shapes.emplace_back(std::move(a));
+        _b_dims.emplace_back(std::move(b));
+        _c_dims.emplace_back(std::move(c));
+        _d_dims.emplace_back(std::move(d));
+        _data_layout.emplace_back(std::move(data_layout));
+    }
+
+protected:
+    WinogradOutputTransformDataset()                                  = default;
+    WinogradOutputTransformDataset(WinogradOutputTransformDataset &&) = default;
+
+private:
+    std::vector<TensorShape> _a_shapes{};
+    std::vector<Size2D>      _b_dims{};
+    std::vector<Size2D>      _c_dims{};
+    std::vector<Size2D>      _d_dims{};
+    std::vector<DataLayout>  _data_layout{};
+};
+
+class SmallWinogradOutputTransformDataset final : public WinogradOutputTransformDataset
+{
+public:
+    SmallWinogradOutputTransformDataset()
+    {
+        add_config(TensorShape(24U, 49U, 16U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW);
+        add_config(TensorShape(13U, 6U, 16U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW);
+        add_config(TensorShape(7U, 20U, 16U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW);
+        add_config(TensorShape(24U, 49U, 16U, 3U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW);
+        add_config(TensorShape(13U, 6U, 16U, 2U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW);
+        add_config(TensorShape(7U, 20U, 16U, 5U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW);
+    }
+};
+
+class LargeWinogradOutputTransformDataset final : public WinogradOutputTransformDataset
+{
+public:
+    LargeWinogradOutputTransformDataset()
+    {
+        add_config(TensorShape(128U, 3136U, 16U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW);
+        add_config(TensorShape(256U, 784U, 16U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW);
+        add_config(TensorShape(512U, 169U, 16U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW);
+        add_config(TensorShape(128U, 3136U, 16U, 3U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW);
+        add_config(TensorShape(256U, 784U, 16U, 2U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW);
+        add_config(TensorShape(512U, 169U, 16U, 5U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW);
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET */
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 0b21ed2577..aa668fa575 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -22,17 +22,22 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
+#include "tests/datasets/LargeConvolutionLayerDataset.h"
 #include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SmallConvolutionLayerDataset.h"
 #include "tests/datasets/WinogradFilterTransformDataset.h"
 #include "tests/datasets/WinogradInputTransformDataset.h"
+#include "tests/datasets/WinogradOutputTransformDataset.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -47,7 +52,7 @@ namespace validation
 {
 namespace
 {
-constexpr AbsoluteTolerance<float> tolerance_f32(0.0001f);
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f);
 } // namespace
 
 using namespace arm_compute::misc::shape_calculator;
@@ -65,9 +70,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                                                         TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
                                                                                         TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32),     // Kernel size not supported
                                                                                         TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32),     // Strides not supported
-                                                                                        TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32),         // valid
-                                                                                        TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32),     // valid
-                                                                                        TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32)         // valid
+                                                                                        TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32),         // Padding needed
+                                                                                        TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32),     // Padding needed
+                                                                                        TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32)         // Padding needed
                                                                                     }),
                                                 framework::dataset::make("OutputInfo", {
                                                                                         TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F16),
@@ -96,7 +101,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                                                         Size2D(3U, 3U),
                                                                                         Size2D(3U, 3U)
                                                                                     })),
-                                                framework::dataset::make("Expected", { false, false, false, false, true, true, true })),
+                                                framework::dataset::make("Expected", { false, false, false, false, false, false, false })),
                                             input_info, output_info, conv_info, kernel_dims, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, kernel_dims)) == expected, framework::LogLevel::ERRORS);
@@ -203,8 +208,172 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::Da
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE_END() // FilterTransform
 
+TEST_SUITE(OutputTransform)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+                                                framework::dataset::make("InputInfo",{
+                                                                                        TensorInfo(TensorShape(24U, 49U, 16U, 5U), 1, DataType::F16),        // F16 not supported
+                                                                                        TensorInfo(TensorShape(128U, 3136U, 16U, 5U), 1, DataType::QASYMM8), // QASYMM8 not supported
+                                                                                        TensorInfo(TensorShape(256U, 784U, 16U, 5U), 1, DataType::F32),      // Kernel size not supported
+                                                                                        TensorInfo(TensorShape(512U, 169U, 16U, 5U), 1, DataType::F32),      // Valid
+                                                                                        TensorInfo(TensorShape(13U, 6U, 16U, 4U), 1, DataType::F32),         // Padding needed
+                                                                                        TensorInfo(TensorShape(7U, 16U, 16U, 7U), 1, DataType::F32),         // Valid
+                                                                                        TensorInfo(TensorShape(1U, 442U, 16U, 37U), 1, DataType::F32)        // Wrong number of tiles
+                                                                                    }),
+                                                framework::dataset::make("BiasInfo", {
+                                                                                        TensorInfo(TensorShape(24U), 1, DataType::F16),
+                                                                                        TensorInfo(TensorShape(128U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(256U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(512U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(13U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(1U), 1, DataType::F32)
+                                                                                    })),
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(14U, 14U, 24U, 5U), 1, DataType::F16),
+                                                                                        TensorInfo(TensorShape(112U, 112U, 128U, 5U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(55U, 55U, 256U, 5U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(26U, 26U, 512U, 5U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(5U, 4U, 13U, 4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(8U, 8U, 7U, 7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(51U, 33U, 1U, 37U), 1, DataType::F32)
+                                                                                    })),
+                                                framework::dataset::make("KernelDims", {
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(5U, 5U),
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(3U, 3U)
+                                                                                    })),
+                                                framework::dataset::make("OutputDims", {
+                                                                                        Size2D(14U, 14U),
+                                                                                        Size2D(112U, 112U),
+                                                                                        Size2D(55U, 55U),
+                                                                                        Size2D(26U, 26U),
+                                                                                        Size2D(5U, 4U),
+                                                                                        Size2D(8U, 8U),
+                                                                                        Size2D(51U, 33U)
+                                                                                    })),
+                                                framework::dataset::make("NumTiles", {
+                                                                                        Size2D(7U, 7U),
+                                                                                        Size2D(56U, 56U),
+                                                                                        Size2D(28U, 28U),
+                                                                                        Size2D(13U, 13U),
+                                                                                        Size2D(3U, 2U),
+                                                                                        Size2D(4U, 4U),
+                                                                                        Size2D(26U, 16U)
+                                                                                    })),
+                                                framework::dataset::make("Expected", { false, false, false, true, false, true, false })),
+                                            input_info, bias_info, output_info, kernel_dims, output_dims, num_tiles, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), kernel_dims, output_dims, num_tiles)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradOutputTransform        = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradOutputTransformKernel, 0>;
+using CLWinogradOutputTransformFixture = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, float>;
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallWinogradOutputTransformDataset(), datasets::LargeWinogradOutputTransformDataset()),
+                                                                   framework::dataset::make("DataType", { DataType::F32 })),
+               shape_a, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type)
+{
+    TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), output_convolved_dims, data_layout);
+
+    // Create tensors
+    CLTensor a = create_tensor<CLTensor>(shape_a, data_type);
+    CLTensor b = create_tensor<CLTensor>(shape_b, data_type);
+
+    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLWinogradOutputTransform winograd_output_transform;
+    winograd_output_transform.configure(&a, nullptr, &b, kernel_dims, output_convolved_dims, num_tiles);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, combine(datasets::SmallWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // OutputTransform
+
+TEST_SUITE(ConvolutionLayer)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+                                                framework::dataset::make("InputInfo", {
+                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16),     // FP16 not supported
+                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),     // Datatype mismatch
+                                                                                        TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported
+                                                                                        TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32),     // Padding needed
+                                                                                        TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)  // Kernel size not supported
+                                                                                      }),
+                                                framework::dataset::make("WeightsInfo", {
+                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
+                                                                                        })),
+                                                framework::dataset::make("BiasesInfo", {
+                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(21U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U), 1, DataType::F32)
+                                                                                       })),
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
+                                                                                       })),
+                                                framework::dataset::make("ConvInfo", {
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 2, 0, 0),
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 1, 1, 0)
+                                                                                                                 })),
+                                                framework::dataset::make("Expected", { false, false, false, false, false })),
+               input_info, weights_info, bias_info, output_info, conv_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                                                                                               framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // ConvolutionLayer
+
 TEST_SUITE_END() // Winograd
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 59db279ac7..34306b381c 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -109,10 +109,12 @@ TEST_SUITE_END()
 
 TEST_SUITE(WinogradLayer)
 template <typename T>
-using NEWinogradLayerFixture = WinogradLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
+using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradLayerFixture<float>, framework::DatasetMode::PRECOMMIT, datasets::SmallWinogradLayerDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                                                                                                      datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+                                                                                                                      framework::dataset::make("DataType", { DataType::F32 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/fixtures/WinogradLayerFixture.h b/tests/validation/fixtures/WinogradLayerFixture.h
index bfe1efce3b..9811c28008 100644
--- a/tests/validation/fixtures/WinogradLayerFixture.h
+++ b/tests/validation/fixtures/WinogradLayerFixture.h
@@ -48,14 +48,14 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class WinogradLayerValidationFixture : public framework::Fixture
+class WinogradConvolutionLayerValidationFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, DataType data_type)
     {
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info);
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type);
     }
 
 protected:
@@ -79,13 +79,14 @@ protected:
         }
     }
 
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                              DataType data_type)
     {
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, DataType::F32, 1);
-        TensorType weights = create_tensor<TensorType>(weights_shape, DataType::F32, 1);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, DataType::F32, 1);
-        TensorType dst     = create_tensor<TensorType>(output_shape, DataType::F32, 1);
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, data_type, 1);
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1);
 
         // Create and configure function
         FunctionType conv;
@@ -111,20 +112,20 @@ protected:
         fill(AccessorType(src), 0, -1.f, 1.f);
         fill(AccessorType(weights), 1, -1.f, 1.f);
         fill(AccessorType(bias), 2, -1.f, 1.f);
-        fill(AccessorType(dst), 3, -1.f, 1.f);
 
-        // Compute NEWinogradLayer function
+        // Compute Winograd Convolution function
         conv.run();
 
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                                      DataType data_type)
     {
         // Create reference
-        SimpleTensor<T> src{ input_shape, DataType::F32, 1 };
-        SimpleTensor<T> weights{ weights_shape, DataType::F32, 1 };
-        SimpleTensor<T> bias{ bias_shape, DataType::F32, 1 };
+        SimpleTensor<T> src{ input_shape, data_type, 1 };
+        SimpleTensor<T> weights{ weights_shape, data_type, 1 };
+        SimpleTensor<T> bias{ bias_shape, data_type, 1 };
 
         // Fill reference
         fill(src, 0, -1.f, 1.f);
@@ -136,8 +137,6 @@ protected:
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
-    int             _fractional_bits{};
-    DataType        _data_type{};
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -178,7 +177,6 @@ protected:
     {
         ARM_COMPUTE_UNUSED(is_nchw_format);
 
-        // Create tensors
         TensorType src = create_tensor<TensorType>(input_shape, data_type);
         TensorType dst = create_tensor<TensorType>(output_shape, data_type);
 
@@ -261,8 +259,8 @@ protected:
         ARM_COMPUTE_UNUSED(is_nchw_format);
 
         // Create tensors
-        TensorType src = create_tensor<TensorType>(input_shape, data_type);
-        TensorType dst = create_tensor<TensorType>(output_shape, data_type);
+        TensorType src = create_tensor<TensorType>(input_shape, data_type, 1);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1);
 
         // Create and configure function
         FunctionType filter_transform;
@@ -288,7 +286,7 @@ protected:
 
     SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, bool is_nchw_format, DataType data_type)
     {
-        ARM_COMPUTE_ERROR_ON(!is_nchw_format);
+        ARM_COMPUTE_UNUSED(is_nchw_format);
 
         // Create reference
         SimpleTensor<T> src{ input_shape, data_type, 1 };
@@ -302,6 +300,86 @@ protected:
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class WinogradOutputTransformValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, Size2D kernel_dims, Size2D output_convolved_dims, Size2D num_tiles, DataLayout data_layout, DataType data_type)
+    {
+        TensorShape output_shape = compute_winograd_output_transform_shape(TensorInfo(input_shape, 1, data_type), output_convolved_dims, data_layout);
+
+        _target    = compute_target(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type);
+        _reference = compute_reference(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, float min, float max)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<> distribution(min, max);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                library->fill_tensor_uniform(tensor, i);
+                break;
+            }
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, Size2D &num_tiles, DataLayout data_layout,
+                              DataType data_type)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+
+        // Create and configure function
+        FunctionType output_transform;
+        output_transform.configure(&src, nullptr, &dst, kernel_dims, output_convolved_dims, num_tiles);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0, -1.f, 1.f);
+
+        output_transform.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, Size2D &num_tiles,
+                                      DataLayout data_layout,
+                                      DataType   data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type, 1, 0, QuantizationInfo(), data_layout };
+
+        // Fill reference
+        fill(src, 0, -1.f, 1.f);
+
+        return reference::winograd_output_transform<T>(src, output_shape, kernel_dims, num_tiles);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 24bbf32a30..f3db274935 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -118,4 +118,4 @@ template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &sr
 } // namespace reference
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 3ed55fb9fc..c760663b22 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -38,6 +38,87 @@ namespace reference
 {
 namespace
 {
+template <typename T>
+void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
+{
+    // Simple tensor for the 3x3 input tile
+    SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix
+    SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix transpose
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
+
+    // Simple tensor for the 4x3 temporary tile
+    SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
+
+    // Simple tensor for the 4x4 output tile
+    SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+
+    // Initialize transformation matrix
+    // 1   | 0   | 0
+    // 0.5 | 0.5 | 0.5
+    // 0.5 |-0.5 | 0.5
+    // 0   | 0   | 1
+    trans_matrix[0 + 0 * 3] = 1.0f;
+    trans_matrix[1 + 0 * 3] = 0.0f;
+    trans_matrix[2 + 0 * 3] = 0.0f;
+    trans_matrix[0 + 1 * 3] = 0.5f;
+    trans_matrix[1 + 1 * 3] = 0.5f;
+    trans_matrix[2 + 1 * 3] = 0.5f;
+    trans_matrix[0 + 2 * 3] = 0.5f;
+    trans_matrix[1 + 2 * 3] = -0.5f;
+    trans_matrix[2 + 2 * 3] = 0.5f;
+    trans_matrix[0 + 3 * 3] = 0.0f;
+    trans_matrix[1 + 3 * 3] = 0.0f;
+    trans_matrix[2 + 3 * 3] = 1.0f;
+
+    // Transpose the transformation matrix
+    transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+    const int num_channels = in.shape()[2];
+    const int num_filters  = in.shape()[3];
+    const int num_batches  = in.shape().total_size() / (9 * num_channels * num_filters);
+
+    for(int n = 0; n < num_batches; ++n)
+    {
+        for(int w = 0; w < num_filters; ++w)
+        {
+            for(int z = 0; z < num_channels; ++z)
+            {
+                // Load the 3x3 tile from the input tensor
+                get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+
+                // First transformation
+                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+                // Second transformation
+                matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+
+                // Store the 4x4 output tile across the 16 channels
+                const int output_offset                              = w + z * num_filters;
+                out[output_offset + 0 * num_filters * num_channels]  = output_tile[0 + 0 * 4];
+                out[output_offset + 1 * num_filters * num_channels]  = output_tile[1 + 0 * 4];
+                out[output_offset + 2 * num_filters * num_channels]  = output_tile[2 + 0 * 4];
+                out[output_offset + 3 * num_filters * num_channels]  = output_tile[3 + 0 * 4];
+                out[output_offset + 4 * num_filters * num_channels]  = output_tile[0 + 1 * 4];
+                out[output_offset + 5 * num_filters * num_channels]  = output_tile[1 + 1 * 4];
+                out[output_offset + 6 * num_filters * num_channels]  = output_tile[2 + 1 * 4];
+                out[output_offset + 7 * num_filters * num_channels]  = output_tile[3 + 1 * 4];
+                out[output_offset + 8 * num_filters * num_channels]  = output_tile[0 + 2 * 4];
+                out[output_offset + 9 * num_filters * num_channels]  = output_tile[1 + 2 * 4];
+                out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
+                out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
+                out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
+                out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
+                out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
+                out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
+            }
+        }
+    }
+}
+
 template <typename T>
 void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
 {
@@ -112,56 +193,70 @@ void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &ds
 }
 
 template <typename T>
-void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
+void winograd_output_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out, int num_tiles_x)
 {
+    ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16);
+    ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+
     // Simple tensor for the 3x3 input tile
-    SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
+    SimpleTensor<T> input_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
 
     // Simple tensor for the transformation matrix
-    SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
+    SimpleTensor<T> trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 };
 
     // Simple tensor for the transformation matrix transpose
-    SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 };
 
     // Simple tensor for the 4x3 temporary tile
-    SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
+    SimpleTensor<T> tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 };
 
     // Simple tensor for the 4x4 output tile
-    SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+    SimpleTensor<T> output_tile{ TensorShape(2u, 2u), in.data_type(), 1 };
 
     // Initialize transformation matrix
-    // 1   | 0   | 0
-    // 0.5 | 0.5 | 0.5
-    // 0.5 |-0.5 | 0.5
-    // 0   | 0   | 1
-    trans_matrix[0 + 0 * 3] = 1.0f;
-    trans_matrix[1 + 0 * 3] = 0.0f;
-    trans_matrix[2 + 0 * 3] = 0.0f;
-    trans_matrix[0 + 1 * 3] = 0.5f;
-    trans_matrix[1 + 1 * 3] = 0.5f;
-    trans_matrix[2 + 1 * 3] = 0.5f;
-    trans_matrix[0 + 2 * 3] = 0.5f;
-    trans_matrix[1 + 2 * 3] = -0.5f;
-    trans_matrix[2 + 2 * 3] = 0.5f;
-    trans_matrix[0 + 3 * 3] = 0.0f;
-    trans_matrix[1 + 3 * 3] = 0.0f;
-    trans_matrix[2 + 3 * 3] = 1.0f;
+    // 1   | 1   | 1   | 1
+    // 0   | 1   | -1  | -1
+    trans_matrix[0 + 0 * 4] = 1.0f;
+    trans_matrix[1 + 0 * 4] = 1.0f;
+    trans_matrix[2 + 0 * 4] = 1.0f;
+    trans_matrix[3 + 0 * 4] = 0.0f;
+    trans_matrix[0 + 1 * 4] = 0.0f;
+    trans_matrix[1 + 1 * 4] = 1.0f;
+    trans_matrix[2 + 1 * 4] = -1.0f;
+    trans_matrix[3 + 1 * 4] = -1.0f;
 
     // Transpose the transformation matrix
     transpose_matrix(trans_matrix, trans_matrix_transposed);
 
-    const int num_channels = in.shape()[2];
-    const int num_filters  = in.shape()[3];
-    const int num_batches  = in.shape().total_size() / (9 * num_channels * num_filters);
+    const int w_in        = in.shape()[0];
+    const int h_in        = in.shape()[1];
+    const int c_in        = in.shape()[2];
+    const int w_out       = out.shape()[0];
+    const int h_out       = out.shape()[1];
+    const int c_out       = out.shape()[2];
+    const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
+
+    // Input strides
+    const int stridey_in = w_in;
+    const int stridez_in = stridey_in * h_in;
+    const int stridew_in = stridez_in * c_in;
+
+    // Output strides
+    const int stridey_out = w_out;
+    const int stridez_out = stridey_out * h_out;
+    const int stridew_out = stridez_out * c_out;
 
     for(int n = 0; n < num_batches; ++n)
     {
-        for(int w = 0; w < num_filters; ++w)
+        for(int y = 0; y < h_in; ++y)
         {
-            for(int z = 0; z < num_channels; ++z)
+            for(int x = 0; x < w_in; ++x)
             {
-                // Load the 3x3 tile from the input tensor
-                get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+                // Load the 4x4 tile across the 16 channels of the input tensor
+                for(int z = 0; z < c_in; ++z)
+                {
+                    input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
+                }
 
                 // First transformation
                 matrix_multiply(trans_matrix, input_tile, tmp_tile);
@@ -169,24 +264,29 @@ void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &ou
                 // Second transformation
                 matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
 
-                // Store the 4x4 output tile across the 16 channels
-                const int output_offset                              = w + z * num_filters;
-                out[output_offset + 0 * num_filters * num_channels]  = output_tile[0 + 0 * 4];
-                out[output_offset + 1 * num_filters * num_channels]  = output_tile[1 + 0 * 4];
-                out[output_offset + 2 * num_filters * num_channels]  = output_tile[2 + 0 * 4];
-                out[output_offset + 3 * num_filters * num_channels]  = output_tile[3 + 0 * 4];
-                out[output_offset + 4 * num_filters * num_channels]  = output_tile[0 + 1 * 4];
-                out[output_offset + 5 * num_filters * num_channels]  = output_tile[1 + 1 * 4];
-                out[output_offset + 6 * num_filters * num_channels]  = output_tile[2 + 1 * 4];
-                out[output_offset + 7 * num_filters * num_channels]  = output_tile[3 + 1 * 4];
-                out[output_offset + 8 * num_filters * num_channels]  = output_tile[0 + 2 * 4];
-                out[output_offset + 9 * num_filters * num_channels]  = output_tile[1 + 2 * 4];
-                out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
-                out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
-                out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
-                out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
-                out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
-                out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
+                // Store the 2x2 output tile
+                const int xo = (y % num_tiles_x) * 2;
+                const int yo = (y / num_tiles_x) * 2;
+                const int zo = x;
+
+                const int output_offset                  = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
+                out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2];
+
+                // Check out-of-bound writes
+                if(xo + 1 < w_out)
+                {
+                    out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2];
+                }
+
+                if(yo + 1 < h_out)
+                {
+                    out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2];
+                }
+
+                if((yo + 1 < h_out) && (xo + 1 < w_out))
+                {
+                    out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2];
+                }
             }
         }
     }
@@ -234,8 +334,32 @@ SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const Tenso
     return out;
 }
 
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
+    ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height);
+    ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area());
+
+    // Create reference
+    SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+    switch(kernel_dims.width)
+    {
+        case 3:
+            winograd_output_transform3x3(in, out, num_tiles.width);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Only supported 3x3 kernel");
+            break;
+    }
+
+    return out;
+}
+
 template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims);
 template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h
index ba8e5c1cb6..fa1a7f3f61 100644
--- a/tests/validation/reference/Winograd.h
+++ b/tests/validation/reference/Winograd.h
@@ -41,6 +41,9 @@ SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &src, const Tenso
 
 template <typename T>
 SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape);
+
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
 } // namespace reference
 } // namespace validation
 } // namespace test
-- 
cgit v1.2.1