From 2213d4b334567d0cb7f283090d42b5fb1b70f66b Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 27 Apr 2018 10:39:06 +0100 Subject: COMPMID-1096 - Add fast_math flag to CLConvolutionLayer COMPMID-1103 - CLWinogradConvolutionLayer mismatches Change-Id: Iceaa9482a1790ec39d2720c220261aaea8043978 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129398 Tested-by: Jenkins Reviewed-by: Giorgio Arena Reviewed-by: Georgios Pinitas --- .../core/NEON/kernels/NEReshapeLayerKernel.h | 2 +- .../runtime/CL/functions/CLConvolutionLayer.h | 81 +++++++------- .../CL/functions/CLWinogradConvolutionLayer.h | 50 +++++---- .../runtime/NEON/functions/NEConvolutionLayer.h | 2 +- .../NEON/functions/NEPixelWiseMultiplication.h | 10 +- .../runtime/NEON/functions/NEReshapeLayer.h | 4 +- src/core/CL/cl_kernels/winograd.cl | 16 +-- src/runtime/CL/functions/CLConvolutionLayer.cpp | 37 +++---- .../CL/functions/CLWinogradConvolutionLayer.cpp | 89 ++++++++++----- tests/datasets/LargeConvolutionLayerDataset.h | 19 ++++ tests/validation/CL/ConvolutionLayer.cpp | 102 ++++++++++------- tests/validation/CL/DilatedConvolutionLayer.cpp | 6 +- tests/validation/CL/Winograd.cpp | 24 +++- .../fixtures/WinogradConvolutionLayerFixture.h | 122 ++++++++++++++++++++- tests/validation/reference/GEMM.cpp | 102 ++++++++++++----- tests/validation/reference/Winograd.cpp | 7 +- tests/validation/reference/Winograd.h | 2 +- 17 files changed, 477 insertions(+), 198 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h index cd70198d6c..0a3fc44881 100644 --- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h @@ -40,7 +40,7 @@ public: } /** Set the input and output of the kernel * - * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] input Source tensor. 
Data type supported: U8/S8/QS8/U16/S16/QS16/QASYMM8/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index a1cd15515f..5c05334a56 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -37,6 +37,7 @@ namespace arm_compute /** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions: * * -# @ref CLGEMMConvolutionLayer + * -# @ref CLWinogradConvolutionLayer * -# @ref CLDirectConvolutionLayer */ class CLConvolutionLayer : public IFunction @@ -46,57 +47,63 @@ public: CLConvolutionLayer(std::shared_ptr memory_manager = nullptr); /** Set the input and output tensors. * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QASYMM8/QS16/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. - * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. 
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. 
Default is false */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QASYMM8/QS16/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QASYMM8/QS16/F16/F32. 
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] gpu_target Specifies the @p GPUTarget. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] gpu_target Specifies the @p GPUTarget. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] enable_fast_math (Optional) Enable fast math computation. 
In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false * * @return a status */ static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U)); + const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h index 2cf1f77fb4..a27976959c 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h @@ -51,38 +51,44 @@ public: CLWinogradConvolutionLayer(std::shared_ptr memory_manager = nullptr); /** Set the input and output tensors. * - * @note: This function only works with 3x3 kernels and unit strides + * @note: This function only works with 3x3 and 5x5 kernels along with unit strides + * @note Some Winograd configurations (i.e. F(4x4, 3x3) and F(4x4, 5x5)) are supported only with enable_fast_math = true * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input - * @param[out] output Destination tensor. 
3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. 
Default is false */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer * - * @note: This function only works with 3x3 kernels and unit strides + * @note: This function only works with 3x3 and 5x5 kernels along with unit strides + * @note Some Winograd configurations (i.e. F(4x4, 3x3) and F(4x4, 5x5)) are supported only with enable_fast_math = true * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input - * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. 
Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index ce9a3ed4f2..b82ba89f7c 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -52,7 +52,7 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QASYMM8/QS16/F32. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. 
* Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index 7d5e397e80..ba96ae6cfa 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -37,11 +37,11 @@ class NEPixelWiseMultiplication : public INESimpleFunction public: /** Initialise the kernel's inputs, output and convertion policy. * - * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/S16/F32. + * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/S16/F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output Output tensor. Data types supported: U8/QS8/S16/F32. + * @param[out] output Output tensor. Data types supported: U8/QS8/S16/F16/F32. * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. * @param[in] overflow_policy Overflow policy. @@ -50,9 +50,9 @@ public: void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication * - * @param[in] input1 First tensor info input. Data types supported: U8/QS8/S16/F32. - * @param[in] input2 Second tensor info input. Data types supported: U8/QS8/S16/F32. - * @param[in] output Output tensor info. 
Data types supported: U8/QS8/S16/F32. + * @param[in] input1 First tensor info input. Data types supported: U8/QS8/S16/F16/F32. + * @param[in] input2 Second tensor info input. Data types supported: U8/QS8/S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8/QS8/S16/F16/F32. * @param[in] scale Scale to apply after multiplication. Must be positive. * @param[in] overflow_policy Overflow policy. * @param[in] rounding_policy Rounding policy. diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h index 369f50e147..0bab534ebc 100644 --- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,7 +37,7 @@ class NEReshapeLayer : public INESimpleFunction public: /** Initialise the kernel's inputs and outputs * - * @param[in] input First tensor input. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] input First tensor input. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 * @param[out] output Output tensor. 
Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl index f40a969ea0..0458e53734 100644 --- a/src/core/CL/cl_kernels/winograd.cl +++ b/src/core/CL/cl_kernels/winograd.cl @@ -405,16 +405,16 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw( out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; - out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f * + out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; - out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f * + out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; - out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * 
w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f * + out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; - out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f * + out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f; @@ -427,16 +427,16 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw( out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f; - out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f * + out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + 
w40.s3) + 16.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; - out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f * + out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; - out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f * + out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; - out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f * + out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f; diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 97ef895434..83281e1747 
100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -43,32 +43,33 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_ma } void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) + const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, + enable_fast_math)); switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, - weights_info, act_info, CLScheduler::get().target(), dilation)) + weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, biases, output, conv_info); + f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); _function = std::move(f); break; } case ConvolutionMethod::DIRECT: { auto f = arm_compute::support::cpp14::make_unique(); - f->configure(input, weights, biases, output, conv_info); + f->configure(input, weights, biases, output, conv_info, act_info); _function = std::move(f); break; } case ConvolutionMethod::GEMM: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, biases, output, conv_info, weights_info, dilation); 
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info); _function = std::move(f); break; } @@ -79,19 +80,18 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c } Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - //Configure if the parameters match the direct convolution or the gemm-based const GPUTarget gpu_target = CLScheduler::get().target(); - switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation)) + switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { //Validate Winograd - CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info); + CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); break; } case ConvolutionMethod::DIRECT: @@ -115,25 +115,22 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo } ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation) + const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); 
ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_UNUSED(weights_info); ARM_COMPUTE_UNUSED(gpu_target); - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - - if((input->data_type() == DataType::F32) && (input->data_layout() == DataLayout::NCHW) && (input->dimension(idx_c) > 3) && (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3) - && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1) && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U)) && (!act_info.enabled())) + if(dilation != Size2D(1U, 1U)) + { + return ConvolutionMethod::GEMM; + } + else { - return ConvolutionMethod::WINOGRAD; + return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; } - return ConvolutionMethod::GEMM; } void CLConvolutionLayer::run() diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 65747cf5d7..5ff4fbceee 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -31,33 +31,69 @@ using namespace arm_compute; +namespace +{ +Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, bool enable_fast_math) +{ + Size2D output_tile = Size2D{}; + + if(kernel_dims == Size2D(3U, 3U)) + { + output_tile = ((input_dims.width <= 4 && input_dims.height <= 4) || !enable_fast_math) ? 
Size2D(2U, 2U) : Size2D(4U, 4U); + } + else if(kernel_dims == Size2D(5U, 5U)) + { + output_tile = Size2D(4U, 4U); + } + + return output_tile; +} + +bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size) +{ + // Check if we want to configure a Winograd configuration which requires fast math + using WinogradConfiguration = std::pair, std::pair>; + + std::vector fast_math_winograd = + { + WinogradConfiguration(std::pair(4, 4), std::pair(3, 3)), + WinogradConfiguration(std::pair(4, 4), std::pair(5, 5)) + }; + + auto p = std::make_pair(std::pair(output_tile.width, output_tile.height), + std::pair(kernel_size.width, kernel_size.height)); + + return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); +} +} // namespace + CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(), _is_first_run(true), _is_activationlayer_enabled(false) { } -void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, + bool enable_fast_math) { // Get indices for the width and height const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - // Input shape - const TensorShape input_shape = input->info()->tensor_shape(); - const unsigned int input_w = input->info()->tensor_shape()[idx_width]; - 
const unsigned int input_h = input->info()->tensor_shape()[idx_height]; - - // Kernel size - const unsigned int kernel_w = weights->info()->tensor_shape()[idx_width]; - const unsigned int kernel_h = weights->info()->tensor_shape()[idx_height]; + // Input shape, kernel size and output tile + const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, enable_fast_math); - //Winograd output tile - const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w <= 4 && input_h <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); + // Check if the Winograd configuration requires fast math + if(!enable_fast_math) + { + ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + } const WinogradInfo winograd_info = WinogradInfo(output_tile, - Size2D(kernel_w, kernel_h), - Size2D(input_shape[idx_width], input_shape[idx_height]), + kernel_size, + input_dims, conv_info, input->info()->data_layout()); @@ -93,27 +129,26 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we } Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) + const ActivationLayerInfo &act_info, bool enable_fast_math) { // Get indeces for the width and height const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - // Input shape - const TensorShape input_shape = input->tensor_shape(); - const unsigned int 
input_w = input->tensor_shape()[idx_width]; - const unsigned int input_h = input->tensor_shape()[idx_height]; - - // Kernel size - const unsigned int kernel_w = weights->tensor_shape()[idx_width]; - const unsigned int kernel_h = weights->tensor_shape()[idx_height]; + // Input shape, kernel size and output tile + const Size2D input_dims = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]); + const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); + const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, enable_fast_math); - //Winograd output tile - const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w <= 4 && input_h <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); + // Check if the Winograd configuration requires fast math + if(!enable_fast_math) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + } const WinogradInfo winograd_info = WinogradInfo(output_tile, - Size2D(kernel_w, kernel_h), - Size2D(input_shape[idx_width], input_shape[idx_height]), + kernel_size, + input_dims, conv_info, input->data_layout()); @@ -139,7 +174,7 @@ Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen // Validate Activation Layer if(act_info.enabled()) { - CLActivationLayer::validate(output, nullptr, act_info); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); } return Status{}; diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h index ec8e09fa81..36b3d60d57 100644 --- a/tests/datasets/LargeConvolutionLayerDataset.h +++ b/tests/datasets/LargeConvolutionLayerDataset.h @@ -59,6 +59,25 @@ public: } }; +class LargeWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer5x5Dataset() + { + // Kernel 
size 5 + // Batch size 1 + add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(222U, 222U, 64U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(123U, 134U, 16U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(121U, 130U, 7U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 5U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 2)); + + // Batch size 2, 3 and 4 + add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(222U, 222U, 64U, 2U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(121U, 130U, 7U, 3U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U, 4U), PadStrideInfo(1, 1, 0, 0)); + } +}; + class LargeConvolutionLayerDataset final : public ConvolutionLayerDataset { public: diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp index 8685e5bbc7..a2b55a8555 100644 --- a/tests/validation/CL/ConvolutionLayer.cpp +++ b/tests/validation/CL/ConvolutionLayer.cpp @@ -73,44 +73,72 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo TEST_SUITE(CL) TEST_SUITE(ConvolutionLayer) -DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), - TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), - TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0), - 
TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0) - }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), - TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0) - })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0), - TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0), - TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0) - })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1), - PadStrideInfo(1, 2, 1, 1), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(3, 2, 1, 0) - })), - framework::dataset::make("GpuTarget", { GPUTarget::BIFROST, - GPUTarget::MIDGARD, - GPUTarget::G71, - GPUTarget::MIDGARD, - GPUTarget::BIFROST - })), - - framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })), - input_info, weights_info, output_info, conv_info, gpu_target, expected) +DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), + TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0), + TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0) + }), + 
framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0), + TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0) + })), + framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, 0) + })), + framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1), + PadStrideInfo(1, 2, 1, 1), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(2, 1, 0, 0), + PadStrideInfo(3, 2, 1, 0), + PadStrideInfo(1, 1, 2, 2), + PadStrideInfo(1, 1, 2, 2) + })), + framework::dataset::make("GpuTarget", { GPUTarget::BIFROST, + GPUTarget::MIDGARD, + GPUTarget::G71, + GPUTarget::MIDGARD, + GPUTarget::BIFROST, + GPUTarget::BIFROST, + GPUTarget::BIFROST + })), + framework::dataset::make("Dilation", { - ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false), - &weights_info.clone()->set_is_resizable(false), - &output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target); + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(2U, 1U), +})), +framework::dataset::make("EnableFastMath", { false, false, false, false, false, true, true })), 
+framework::dataset::make("Expected", +{ + ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, +})), +input_info, weights_info, output_info, conv_info, gpu_target, dilation, enable_fast_math, expected) +{ + ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true), + &weights_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true), conv_info, + WeightsInfo(), + ActivationLayerInfo(), + gpu_target, + dilation, + enable_fast_math); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } TEST_SUITE_END() diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp index e6a765bbe1..9ee002cc5a 100644 --- a/tests/validation/CL/DilatedConvolutionLayer.cpp +++ b/tests/validation/CL/DilatedConvolutionLayer.cpp @@ -104,9 +104,9 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })), input_info, weights_info, output_info, conv_info, gpu_target, dilation, expected) { - ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false), - &weights_info.clone()->set_is_resizable(false), - &output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target, dilation); + ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true), + &weights_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target, dilation); ARM_COMPUTE_EXPECT(is_valid == expected, 
framework::LogLevel::ERRORS); } TEST_SUITE_END() diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp index 30d8d751af..d892c9f77f 100644 --- a/tests/validation/CL/Winograd.cpp +++ b/tests/validation/CL/Winograd.cpp @@ -51,7 +51,8 @@ namespace validation { namespace { -constexpr AbsoluteTolerance tolerance_f32(0.001f); +constexpr AbsoluteTolerance tolerance_f32(0.0001f); +constexpr AbsoluteTolerance tolerance_fast_math_f32(0.1f); } // namespace using namespace arm_compute::misc::shape_calculator; @@ -379,6 +380,27 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFixture, framework::D // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } +TEST_SUITE(EnableFastMath) +using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(), datasets::SmallWinogradConvolutionLayer5x5Dataset()), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_fast_math_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, + combine(combine(framework::dataset::concat(datasets::LargeWinogradConvolutionLayer3x3Dataset(), datasets::LargeWinogradConvolutionLayer5x5Dataset()), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_fast_math_f32); +} + +TEST_SUITE_END() // EnableFastMath TEST_SUITE_END() // ConvolutionLayer TEST_SUITE_END() // Winograd diff --git 
a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h index 249f9d5649..e15931eafb 100644 --- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h +++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h @@ -35,6 +35,7 @@ #include "tests/validation/Helpers.h" #include "tests/validation/reference/ActivationLayer.h" #include "tests/validation/reference/ConvolutionLayer.h" +#include "tests/validation/reference/GEMM.h" #include "tests/validation/reference/Utils.h" #include "tests/validation/reference/Winograd.h" @@ -152,6 +153,123 @@ protected: SimpleTensor _reference{}; }; +template +class WinogradConvolutionLayerFastMathValidationFixture : public framework::Fixture +{ +public: + template + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataType data_type, ActivationLayerInfo act_info) + { + ARM_COMPUTE_UNUSED(dilation); + + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info); + _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info); + } + +protected: + template + void fill(U &&tensor, int i, float min, float max) + { + switch(tensor.data_type()) + { + case DataType::F32: + { + std::uniform_real_distribution<> distribution(min, max); + library->fill(tensor, distribution, i); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + library->fill_tensor_uniform(tensor, i); + break; + } + } + } + + TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + DataType data_type, ActivationLayerInfo act_info) + { + // Create tensors + TensorType src = create_tensor(input_shape, data_type, 1); + TensorType weights = create_tensor(weights_shape, 
data_type, 1); + TensorType bias = create_tensor(bias_shape, data_type, 1); + TensorType dst = create_tensor(output_shape, data_type, 1); + + // Create and configure function + FunctionType conv; + ARM_COMPUTE_EXPECT(static_cast(conv.validate(src.info(), weights.info(), bias.info(), dst.info(), info, act_info, true /* Enable fast math */)), framework::LogLevel::ERRORS); + conv.configure(&src, &weights, &bias, &dst, info, act_info, true /* Enable fast math */); + + ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Allocate tensors + src.allocator()->allocate(); + weights.allocator()->allocate(); + dst.allocator()->allocate(); + bias.allocator()->allocate(); + + ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Fill tensors + fill(AccessorType(src), 0, -1.f, 1.f); + fill(AccessorType(weights), 1, -1.f, 1.f); + fill(AccessorType(bias), 2, -1.f, 1.f); + + // Compute Winograd Convolution function + conv.run(); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + DataType data_type, ActivationLayerInfo act_info) + { + // Create reference + SimpleTensor src{ input_shape, data_type, 1 }; + SimpleTensor weights{ weights_shape, data_type, 1 }; + SimpleTensor bias{ bias_shape, data_type, 1 }; + + // Fill reference + fill(src, 0, -1.f, 1.f); + fill(weights, 1, -1.f, 1.f); + 
fill(bias, 2, -1.f, 1.f); + + WinogradInfo winograd_info(Size2D(4U, 4U), + Size2D(weights_shape[0], weights_shape[1]), + Size2D(input_shape[0], input_shape[1]), + info, + src.data_layout()); + + // Compute tensor shapes for input, filter and output transforms + TensorShape input_transform_shape = compute_winograd_input_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info); + TensorShape filter_transform_shape = compute_winograd_filter_transform_shape(TensorInfo(weights_shape, 1, data_type), winograd_info); + TensorShape batched_gemm_shape = input_transform_shape; + batched_gemm_shape[0] = filter_transform_shape[0]; + TensorShape output_transform_shape = compute_winograd_output_transform_shape(TensorInfo(batched_gemm_shape, 1, data_type), winograd_info); + + // Dummy matrix C to perform matrix multiplication + SimpleTensor dummy_c{ batched_gemm_shape, data_type, 1 }; + + // Compute Winograd-based convolution + SimpleTensor input_transform_out = reference::winograd_input_transform(src, input_transform_shape, winograd_info); + SimpleTensor filter_transform_out = reference::winograd_filter_transform(weights, filter_transform_shape, winograd_info); + SimpleTensor batched_gemm = reference::gemm(input_transform_out, filter_transform_out, dummy_c, 1.0f, 0.0f); + SimpleTensor conv_out = reference::winograd_output_transform(batched_gemm, bias, output_transform_shape, winograd_info); + + return (act_info.enabled()) ? 
reference::activation_layer(conv_out, act_info) : conv_out; + } + + TensorType _target{}; + SimpleTensor _reference{}; +}; + template class WinogradInputTransformValidationFixture : public framework::Fixture { @@ -373,11 +491,13 @@ protected: { // Create reference SimpleTensor src{ input_shape, data_type }; + SimpleTensor bias{ TensorShape(input_shape[0]), data_type }; // Fill reference fill(src, 0, -1.f, 1.f); + fill(bias, 1, 0.0f, 0.0f); // Fill with zeros as we validate just the output transform without bias contribution - return reference::winograd_output_transform(src, output_shape, winograd_info); + return reference::winograd_output_transform(src, bias, output_shape, winograd_info); } TensorType _target{}; diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp index 77d025ec8e..f9dcfcbdd0 100644 --- a/tests/validation/reference/GEMM.cpp +++ b/tests/validation/reference/GEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,23 +41,44 @@ SimpleTensor gemm(const SimpleTensor &a, const SimpleTensor &b, const S SimpleTensor dst{ c.shape(), c.data_type(), 1, c.fixed_point_position() }; // Compute reference - const int M = dst.shape().y(); - const int N = dst.shape().x(); + const int M = a.shape().y(); + const int N = b.shape().x(); const int K = a.shape().x(); + const int D = a.shape().z(); // Number of matrices in a batch + const int W = a.shape()[3]; // Number of batched-gemm (Winograd case) + + const int a_stride_z = K * M; + const int a_stride_w = K * M * D; + + const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0; // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions + const int b_stride_w = b.shape().num_dimensions() > 3 ? 
K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions - for(int row = 0; row < M; ++row) + const int c_stride_z = N * M; + const int c_stride_w = N * M * D; + + for(int w = 0; w < W; ++w) { - for(int col = 0; col < N; ++col) + for(int depth = 0; depth < D; ++depth) { - T acc(0); + const int base_addr_a = depth * a_stride_z + w * a_stride_w; + const int base_addr_b = depth * b_stride_z + w * b_stride_w; + const int base_addr_c = depth * c_stride_z + w * c_stride_w; - for(int k = 0; k < K; ++k) + for(int row = 0; row < M; ++row) { - acc += a[row * K + k] * b[k * N + col]; + for(int col = 0; col < N; ++col) + { + T acc(0); + + for(int k = 0; k < K; ++k) + { + acc += a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N]; + } + + // Finalize the result: alpha * A * B + beta * C + dst[base_addr_c + col + row * N] = alpha * acc + beta * c[base_addr_c + col + row * N]; + } } - - // Finalize the result: alpha * A * B + beta * C - dst[col + row * N] = alpha * acc + beta * c[col + row * N]; } } @@ -75,37 +96,58 @@ SimpleTensor gemm(const SimpleTensor &a, const SimpleTensor &b, const S // Compute reference using promoted_type = fixed_point_arithmetic::traits::promote_t; - const int M = dst.shape().y(); - const int N = dst.shape().x(); - const int K = a.shape().x(); - const int fixed_point_position = a.fixed_point_position(); + const int M = dst.shape().y(); + const int N = dst.shape().x(); + const int K = a.shape().x(); + const int D = a.shape().z(); // Number of matrices in a batch + const int W = a.shape()[3]; // Number of batched-gemm (Winograd case) + + const int a_stride_z = K * M; + const int a_stride_w = K * M * D; + + const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0; // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions + const int b_stride_w = b.shape().num_dimensions() > 3 ? 
K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions + + const int c_stride_z = N * M; + const int c_stride_w = N * M * D; + const int fixed_point_position = a.fixed_point_position(); const fixed_point alpha_q(alpha, fixed_point_position); const fixed_point beta_q(beta, fixed_point_position); - for(int row = 0; row < M; ++row) + for(int w = 0; w < W; ++w) { - for(int col = 0; col < N; ++col) + for(int depth = 0; depth < D; ++depth) { - fixed_point acc_q(0, fixed_point_position); + const int base_addr_a = depth * a_stride_z + w * a_stride_w; + const int base_addr_b = depth * b_stride_z + w * b_stride_w; + const int base_addr_c = depth * c_stride_z + w * c_stride_w; - for(int k = 0; k < K; ++k) + for(int row = 0; row < M; ++row) { - const fixed_point a0_q(a[row * K + k], fixed_point_position, true); - const fixed_point b0_q(b[k * N + col], fixed_point_position, true); + for(int col = 0; col < N; ++col) + { + fixed_point acc_q(0, fixed_point_position); - acc_q = acc_q + (a0_q * b0_q); - } + for(int k = 0; k < K; ++k) + { + const fixed_point a0_q(a[base_addr_a + row * K + k], fixed_point_position, true); + const fixed_point b0_q(b[base_addr_b + k * N + col], fixed_point_position, true); + + acc_q = acc_q + (a0_q * b0_q); + } - // Finalize the result: alpha * A * B + beta * C - const fixed_point c0_q(c[col + row * N], fixed_point_position, true); + // Finalize the result: alpha * A * B + beta * C + const fixed_point c0_q(c[base_addr_c + col + row * N], fixed_point_position, true); - fixed_point res_q(acc_q); - res_q = alpha_q * res_q; - res_q = res_q + (beta_q * c0_q); + fixed_point res_q(acc_q); + res_q = alpha_q * res_q; + res_q = res_q + (beta_q * c0_q); - // Store the result - dst[col + row * N] = res_q.raw(); + // Store the result + dst[base_addr_c + col + row * N] = res_q.raw(); + } + } } } diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp index 
75b1b51d46..194a78e95f 100644 --- a/tests/validation/reference/Winograd.cpp +++ b/tests/validation/reference/Winograd.cpp @@ -331,7 +331,7 @@ SimpleTensor winograd_filter_transform(const SimpleTensor &in, const Tenso } template -SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info) +SimpleTensor winograd_output_transform(const SimpleTensor &in, const SimpleTensor &b, const TensorShape &output_shape, const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_MSG(winograd_info.output_data_layout != DataLayout::NCHW, "Only supported NCHW data format"); @@ -444,6 +444,9 @@ SimpleTensor winograd_output_transform(const SimpleTensor &in, const Tenso if((xo + xi < w_out) && (yo + yi < h_out)) { out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * out_tile_w]; + + // Add bias + out[output_offset + yi * stridey_out + xi] += b[zo]; } } } @@ -456,7 +459,7 @@ SimpleTensor winograd_output_transform(const SimpleTensor &in, const Tenso template SimpleTensor winograd_filter_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info); template SimpleTensor winograd_input_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info); -template SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info); +template SimpleTensor winograd_output_transform(const SimpleTensor &in, const SimpleTensor &b, const TensorShape &output_shape, const WinogradInfo &winograd_info); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h index 29181f1142..b74c2c3e29 100644 --- a/tests/validation/reference/Winograd.h +++ b/tests/validation/reference/Winograd.h @@ -51,7 +51,7 @@ template SimpleTensor winograd_filter_transform(const SimpleTensor &in, const 
TensorShape &output_shape, const WinogradInfo &winograd_info); template -SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info); +SimpleTensor winograd_output_transform(const SimpleTensor &in, const SimpleTensor &b, const TensorShape &output_shape, const WinogradInfo &winograd_info); } // namespace reference } // namespace validation } // namespace test -- cgit v1.2.1