authorGian Marco Iodice <gianmarco.iodice@arm.com>2018-04-27 10:39:06 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:50:48 +0000
commit2213d4b334567d0cb7f283090d42b5fb1b70f66b (patch)
tree84882854c84af8e184c44a27932ba0c2576ae641
parentebf14a51104205b46c659e042b3077307568c8f6 (diff)
downloadComputeLibrary-2213d4b334567d0cb7f283090d42b5fb1b70f66b.tar.gz
COMPMID-1096 - Add fast_math flag to CLConvolutionLayer
COMPMID-1103 - CLWinogradConvolutionLayer mismatches

Change-Id: Iceaa9482a1790ec39d2720c220261aaea8043978
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129398
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--  arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h | 2
-rw-r--r--  arm_compute/runtime/CL/functions/CLConvolutionLayer.h | 81
-rw-r--r--  arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h | 50
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvolutionLayer.h | 2
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h | 10
-rw-r--r--  arm_compute/runtime/NEON/functions/NEReshapeLayer.h | 4
-rw-r--r--  src/core/CL/cl_kernels/winograd.cl | 16
-rw-r--r--  src/runtime/CL/functions/CLConvolutionLayer.cpp | 37
-rw-r--r--  src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp | 89
-rw-r--r--  tests/datasets/LargeConvolutionLayerDataset.h | 19
-rw-r--r--  tests/validation/CL/ConvolutionLayer.cpp | 102
-rw-r--r--  tests/validation/CL/DilatedConvolutionLayer.cpp | 6
-rw-r--r--  tests/validation/CL/Winograd.cpp | 24
-rw-r--r--  tests/validation/fixtures/WinogradConvolutionLayerFixture.h | 122
-rw-r--r--  tests/validation/reference/GEMM.cpp | 102
-rw-r--r--  tests/validation/reference/Winograd.cpp | 7
-rw-r--r--  tests/validation/reference/Winograd.h | 2
17 files changed, 477 insertions, 198 deletions
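
For orientation, here is a minimal, hypothetical sketch of how a caller opts into the new flag (tensor shapes are illustrative; allocation and run() are omitted). The configure() signature matches the header change below:

    // Sketch: enabling fast math on CLConvolutionLayer (hypothetical shapes).
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 64U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));

    CLConvolutionLayer conv;
    // The trailing bool is the new enable_fast_math flag: it allows the fastest
    // Winograd variants (e.g. F(4x4, 3x3)) at the cost of some accuracy.
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1),
                   WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), true);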
diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
index cd70198d6c..0a3fc44881 100644
--- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
@@ -40,7 +40,7 @@ public:
}
/** Set the input and output of the kernel
*
- * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/QASYMM8/U32/S32/F16/F32
* @param[out] output Destination tensor. Data type supported: Same as @p input
*/
void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index a1cd15515f..5c05334a56 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -37,6 +37,7 @@ namespace arm_compute
/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
*
* -# @ref CLGEMMConvolutionLayer
+ * -# @ref CLWinogradConvolutionLayer
* -# @ref CLDirectConvolutionLayer
*/
class CLConvolutionLayer : public IFunction
@@ -46,57 +47,63 @@ public:
CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QS8/QASYMM8/QS16/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. When this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a loss of accuracy. Defaults to false.
*/
void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QS8/QASYMM8/QS16/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
- * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. When this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a loss of accuracy. Defaults to false.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
/** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QS8/QASYMM8/QS16/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] gpu_target Specifies the @p GPUTarget.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] gpu_target Specifies the @p GPUTarget.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. When this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a loss of accuracy. Defaults to false.
*
* @return a status
*/
static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U));
+ const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
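
The selection helper can also be queried directly with the new parameter; a hedged sketch, reusing the hypothetical tensors from the example above:

    // Sketch: ask which convolution path would be dispatched once fast math is allowed.
    const ConvolutionMethod method = CLConvolutionLayer::get_convolution_method(
        src.info(), weights.info(), dst.info(), PadStrideInfo(1, 1, 1, 1),
        WeightsInfo(), ActivationLayerInfo(), CLScheduler::get().target(),
        Size2D(1U, 1U), /* enable_fast_math */ true);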
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
index 2cf1f77fb4..a27976959c 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -51,38 +51,44 @@ public:
CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
- * @note: This function only works with 3x3 kernels and unit strides
+ * @note This function only works with 3x3 and 5x5 kernels and unit strides
+ * @note Some Winograd configurations (i.e. F(4x4, 3x3) and F(4x4, 5x5)) are supported only with enable_fast_math = true
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. When this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a loss of accuracy. Defaults to false.
*/
void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer
*
- * @note: This function only works with 3x3 kernels and unit strides
+ * @note This function only works with 3x3 and 5x5 kernels and unit strides
+ * @note Some Winograd configurations (i.e. F(4x4, 3x3) and F(4x4, 5x5)) are supported only with enable_fast_math = true
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. When this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a loss of accuracy. Defaults to false.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
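
Because F(4x4, 5x5) is reachable only when fast math is enabled, a cautious caller can validate before configuring. A sketch under the same assumptions as above, where weights5x5 is a hypothetical 5x5 filter tensor:

    // Sketch: 5x5 Winograd requires enable_fast_math = true, so check support first.
    const Status status = CLWinogradConvolutionLayer::validate(
        src.info(), weights5x5.info(), biases.info(), dst.info(),
        PadStrideInfo(1, 1, 2, 2), ActivationLayerInfo(), true);
    if(bool(status))
    {
        CLWinogradConvolutionLayer winograd;
        winograd.configure(&src, &weights5x5, &biases, &dst,
                           PadStrideInfo(1, 1, 2, 2), ActivationLayerInfo(), true);
    }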
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index ce9a3ed4f2..b82ba89f7c 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -52,7 +52,7 @@ public:
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QS8/QASYMM8/QS16/F32.
+ * Data types supported: QS8/QASYMM8/QS16/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
* Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 7d5e397e80..ba96ae6cfa 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -37,11 +37,11 @@ class NEPixelWiseMultiplication : public INESimpleFunction
public:
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/S16/F32.
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/S16/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8/QS8/S16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QS8/S16/F16/F32.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
* @param[in] overflow_policy Overflow policy.
@@ -50,9 +50,9 @@ public:
void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
/** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
*
- * @param[in] input1 First tensor info input. Data types supported: U8/QS8/S16/F32.
- * @param[in] input2 Second tensor info input. Data types supported: U8/QS8/S16/F32.
- * @param[in] output Output tensor info. Data types supported: U8/QS8/S16/F32.
+ * @param[in] input1 First tensor info input. Data types supported: U8/QS8/S16/F16/F32.
+ * @param[in] input2 Second tensor info input. Data types supported: U8/QS8/S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8/QS8/S16/F16/F32.
* @param[in] scale Scale to apply after multiplication. Must be positive.
* @param[in] overflow_policy Overflow policy.
* @param[in] rounding_policy Rounding policy.
diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
index 369f50e147..0bab534ebc 100644
--- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@ class NEReshapeLayer : public INESimpleFunction
public:
/** Initialise the kernel's inputs and outputs
*
- * @param[in] input First tensor input. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input First tensor input. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ITensor *input, ITensor *output);
diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl
index f40a969ea0..0458e53734 100644
--- a/src/core/CL/cl_kernels/winograd.cl
+++ b/src/core/CL/cl_kernels/winograd.cl
@@ -405,16 +405,16 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw(
out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
(16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
(16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f *
+ out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
(16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
(16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f *
+ out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
(16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
(16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f *
+ out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
(16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
(16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f *
+ out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
(16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
(16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
@@ -427,16 +427,16 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw(
out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
(16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
(16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f *
+ out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
(16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
(16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f *
+ out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
(16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
(16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f *
+ out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
(16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
(16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f *
+ out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
(16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
(16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
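
For reference, the out5/out6 rows above are entries of the standard Winograd filter transform (in Lavin & Gray's notation); a hedged math note, not a restatement of the kernel's exact derivation:

    U = G \, g \, G^{\top}, \qquad g \in \mathbb{R}^{5 \times 5}, \quad G \in \mathbb{R}^{8 \times 5}

The transform size 8 follows from output tile 4 + kernel size 5 - 1, and the constant divisors (180, 810, 16200, 32400) absorb the rational entries of G into each accumulated term.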
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 97ef895434..83281e1747 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -43,32 +43,33 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_ma
}
void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+ enable_fast_math));
switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
- weights_info, act_info, CLScheduler::get().target(), dilation))
+ weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
{
auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info);
+ f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_function = std::move(f);
break;
}
case ConvolutionMethod::DIRECT:
{
auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
- f->configure(input, weights, biases, output, conv_info);
+ f->configure(input, weights, biases, output, conv_info, act_info);
_function = std::move(f);
break;
}
case ConvolutionMethod::GEMM:
{
auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info, dilation);
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
_function = std::move(f);
break;
}
@@ -79,19 +80,18 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c
}
Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- //Configure if the parameters match the direct convolution or the gemm-based
const GPUTarget gpu_target = CLScheduler::get().target();
- switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation))
+ switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
{
//Validate Winograd
- CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info);
+ CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
break;
}
case ConvolutionMethod::DIRECT:
@@ -115,25 +115,22 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
}
ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation)
+ const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_UNUSED(weights_info);
ARM_COMPUTE_UNUSED(gpu_target);
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- if((input->data_type() == DataType::F32) && (input->data_layout() == DataLayout::NCHW) && (input->dimension(idx_c) > 3) && (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3)
- && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1) && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U)) && (!act_info.enabled()))
+ if(dilation != Size2D(1U, 1U))
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ else
{
- return ConvolutionMethod::WINOGRAD;
+ return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
}
- return ConvolutionMethod::GEMM;
}
void CLConvolutionLayer::run()
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 65747cf5d7..5ff4fbceee 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -31,33 +31,69 @@
using namespace arm_compute;
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, bool enable_fast_math)
+{
+ Size2D output_tile = Size2D{};
+
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = ((input_dims.width <= 4 && input_dims.height <= 4) || !enable_fast_math) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if(kernel_dims == Size2D(5U, 5U))
+ {
+ output_tile = Size2D(4U, 4U);
+ }
+
+ return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+ // Check if we want to configure a Winograd configuration which requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ std::vector<WinogradConfiguration> fast_math_winograd =
+ {
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ };
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} // namespace
+
CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
_is_first_run(true), _is_activationlayer_enabled(false)
{
}
-void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
// Get indices for the width and height
const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- // Input shape
- const TensorShape input_shape = input->info()->tensor_shape();
- const unsigned int input_w = input->info()->tensor_shape()[idx_width];
- const unsigned int input_h = input->info()->tensor_shape()[idx_height];
-
- // Kernel size
- const unsigned int kernel_w = weights->info()->tensor_shape()[idx_width];
- const unsigned int kernel_h = weights->info()->tensor_shape()[idx_height];
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, enable_fast_math);
- //Winograd output tile
- const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w <= 4 && input_h <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
const WinogradInfo winograd_info = WinogradInfo(output_tile,
- Size2D(kernel_w, kernel_h),
- Size2D(input_shape[idx_width], input_shape[idx_height]),
+ kernel_size,
+ input_dims,
conv_info,
input->info()->data_layout());
@@ -93,27 +129,26 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
}
Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info)
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
{
// Get indices for the width and height
const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- // Input shape
- const TensorShape input_shape = input->tensor_shape();
- const unsigned int input_w = input->tensor_shape()[idx_width];
- const unsigned int input_h = input->tensor_shape()[idx_height];
-
- // Kernel size
- const unsigned int kernel_w = weights->tensor_shape()[idx_width];
- const unsigned int kernel_h = weights->tensor_shape()[idx_height];
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, enable_fast_math);
- //Winograd output tile
- const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w <= 4 && input_h <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
const WinogradInfo winograd_info = WinogradInfo(output_tile,
- Size2D(kernel_w, kernel_h),
- Size2D(input_shape[idx_width], input_shape[idx_height]),
+ kernel_size,
+ input_dims,
conv_info,
input->data_layout());
@@ -139,7 +174,7 @@ Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
// Validate Activation Layer
if(act_info.enabled())
{
- CLActivationLayer::validate(output, nullptr, act_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
}
return Status{};
diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h
index ec8e09fa81..36b3d60d57 100644
--- a/tests/datasets/LargeConvolutionLayerDataset.h
+++ b/tests/datasets/LargeConvolutionLayerDataset.h
@@ -59,6 +59,25 @@ public:
}
};
+class LargeWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset
+{
+public:
+ LargeWinogradConvolutionLayer5x5Dataset()
+ {
+ // Kernel size 5
+ // Batch size 1
+ add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(222U, 222U, 64U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(123U, 134U, 16U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(121U, 130U, 7U), PadStrideInfo(1, 1, 1, 0));
+ add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U), PadStrideInfo(1, 1, 0, 0));
+ add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 5U, 24U, 61U), TensorShape(61U), TensorShape(200U, 201U, 61U), PadStrideInfo(1, 1, 2, 2));
+
+ // Batch size 2, 3 and 4
+ add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(222U, 222U, 64U, 2U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(121U, 130U, 7U, 3U), PadStrideInfo(1, 1, 1, 0));
+ add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U, 4U), PadStrideInfo(1, 1, 0, 0));
+ }
+};
+
class LargeConvolutionLayerDataset final : public ConvolutionLayerDataset
{
public:
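
Each add_config entry above is consistent with the usual convolution output-size formula; a worked check for the first 5x5 configuration:

    out = (in + 2*pad - kernel) / stride + 1 = (224 + 2*1 - 5) / 1 + 1 = 222

which matches the 222x222 destination shape (likewise 123 -> 121 with pad_x = 1, and 134 -> 130 with pad_y = 0).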
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index 8685e5bbc7..a2b55a8555 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -73,44 +73,72 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
TEST_SUITE(CL)
TEST_SUITE(ConvolutionLayer)
-DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
- }),
- framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
- })),
- framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0)
- })),
- framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1),
- PadStrideInfo(1, 2, 1, 1),
- PadStrideInfo(1, 1, 0, 0),
- PadStrideInfo(2, 1, 0, 0),
- PadStrideInfo(3, 2, 1, 0)
- })),
- framework::dataset::make("GpuTarget", { GPUTarget::BIFROST,
- GPUTarget::MIDGARD,
- GPUTarget::G71,
- GPUTarget::MIDGARD,
- GPUTarget::BIFROST
- })),
-
- framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
- input_info, weights_info, output_info, conv_info, gpu_target, expected)
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0)
+ }),
+ framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0),
+ TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0)
+ })),
+ framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, 0)
+ })),
+ framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1),
+ PadStrideInfo(1, 2, 1, 1),
+ PadStrideInfo(1, 1, 0, 0),
+ PadStrideInfo(2, 1, 0, 0),
+ PadStrideInfo(3, 2, 1, 0),
+ PadStrideInfo(1, 1, 2, 2),
+ PadStrideInfo(1, 1, 2, 2)
+ })),
+ framework::dataset::make("GpuTarget", { GPUTarget::BIFROST,
+ GPUTarget::MIDGARD,
+ GPUTarget::G71,
+ GPUTarget::MIDGARD,
+ GPUTarget::BIFROST,
+ GPUTarget::BIFROST,
+ GPUTarget::BIFROST
+ })),
+ framework::dataset::make("Dilation",
{
- ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
- &weights_info.clone()->set_is_resizable(false),
- &output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target);
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(1U, 1U),
+ Size2D(2U, 1U),
+})),
+framework::dataset::make("EnableFastMath", { false, false, false, false, false, true, true })),
+framework::dataset::make("Expected",
+{
+ ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM,
+})),
+input_info, weights_info, output_info, conv_info, gpu_target, dilation, enable_fast_math, expected)
+{
+ ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
+ &weights_info.clone()->set_is_resizable(true),
+ &output_info.clone()->set_is_resizable(true), conv_info,
+ WeightsInfo(),
+ ActivationLayerInfo(),
+ gpu_target,
+ dilation,
+ enable_fast_math);
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
TEST_SUITE_END()
diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp
index e6a765bbe1..9ee002cc5a 100644
--- a/tests/validation/CL/DilatedConvolutionLayer.cpp
+++ b/tests/validation/CL/DilatedConvolutionLayer.cpp
@@ -104,9 +104,9 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
input_info, weights_info, output_info, conv_info, gpu_target, dilation, expected)
{
- ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
- &weights_info.clone()->set_is_resizable(false),
- &output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target, dilation);
+ ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
+ &weights_info.clone()->set_is_resizable(true),
+ &output_info.clone()->set_is_resizable(true), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target, dilation);
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
TEST_SUITE_END()
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 30d8d751af..d892c9f77f 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -51,7 +51,8 @@ namespace validation
{
namespace
{
-constexpr AbsoluteTolerance<float> tolerance_f32(0.001f);
+constexpr AbsoluteTolerance<float> tolerance_f32(0.0001f);
+constexpr AbsoluteTolerance<float> tolerance_fast_math_f32(0.1f);
} // namespace
using namespace arm_compute::misc::shape_calculator;
@@ -379,6 +380,27 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFixture, framework::D
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
+TEST_SUITE(EnableFastMath)
+using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
+ combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(), datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_fast_math_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+ combine(combine(framework::dataset::concat(datasets::LargeWinogradConvolutionLayer3x3Dataset(), datasets::LargeWinogradConvolutionLayer5x5Dataset()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_fast_math_f32);
+}
+
+TEST_SUITE_END() // EnableFastMath
TEST_SUITE_END() // ConvolutionLayer
TEST_SUITE_END() // Winograd
diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
index 249f9d5649..e15931eafb 100644
--- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
@@ -35,6 +35,7 @@
#include "tests/validation/Helpers.h"
#include "tests/validation/reference/ActivationLayer.h"
#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/GEMM.h"
#include "tests/validation/reference/Utils.h"
#include "tests/validation/reference/Winograd.h"
@@ -153,6 +154,123 @@ protected:
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class WinogradConvolutionLayerFastMathValidationFixture : public framework::Fixture
+{
+public:
+ template <typename...>
+ void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataType data_type, ActivationLayerInfo act_info)
+ {
+ ARM_COMPUTE_UNUSED(dilation);
+
+ _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info);
+ _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info);
+ }
+
+protected:
+ template <typename U>
+ void fill(U &&tensor, int i, float min, float max)
+ {
+ switch(tensor.data_type())
+ {
+ case DataType::F32:
+ {
+ std::uniform_real_distribution<> distribution(min, max);
+ library->fill(tensor, distribution, i);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ library->fill_tensor_uniform(tensor, i);
+ break;
+ }
+ }
+ }
+
+ TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+ DataType data_type, ActivationLayerInfo act_info)
+ {
+ // Create tensors
+ TensorType src = create_tensor<TensorType>(input_shape, data_type, 1);
+ TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+ TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
+ TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1);
+
+ // Create and configure function
+ FunctionType conv;
+ ARM_COMPUTE_EXPECT(static_cast<bool>(conv.validate(src.info(), weights.info(), bias.info(), dst.info(), info, act_info, true /* Enable fast math */)), framework::LogLevel::ERRORS);
+ conv.configure(&src, &weights, &bias, &dst, info, act_info, true /* Enable fast math */);
+
+ ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ weights.allocator()->allocate();
+ dst.allocator()->allocate();
+ bias.allocator()->allocate();
+
+ ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Fill tensors
+ fill(AccessorType(src), 0, -1.f, 1.f);
+ fill(AccessorType(weights), 1, -1.f, 1.f);
+ fill(AccessorType(bias), 2, -1.f, 1.f);
+
+ // Compute Winograd Convolution function
+ conv.run();
+
+ return dst;
+ }
+
+ SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+ DataType data_type, ActivationLayerInfo act_info)
+ {
+ // Create reference
+ SimpleTensor<T> src{ input_shape, data_type, 1 };
+ SimpleTensor<T> weights{ weights_shape, data_type, 1 };
+ SimpleTensor<T> bias{ bias_shape, data_type, 1 };
+
+ // Fill reference
+ fill(src, 0, -1.f, 1.f);
+ fill(weights, 1, -1.f, 1.f);
+ fill(bias, 2, -1.f, 1.f);
+
+ WinogradInfo winograd_info(Size2D(4U, 4U),
+ Size2D(weights_shape[0], weights_shape[1]),
+ Size2D(input_shape[0], input_shape[1]),
+ info,
+ src.data_layout());
+
+ // Compute tensor shapes for input, filter and output transforms
+ TensorShape input_transform_shape = compute_winograd_input_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info);
+ TensorShape filter_transform_shape = compute_winograd_filter_transform_shape(TensorInfo(weights_shape, 1, data_type), winograd_info);
+ TensorShape batched_gemm_shape = input_transform_shape;
+ batched_gemm_shape[0] = filter_transform_shape[0];
+ TensorShape output_transform_shape = compute_winograd_output_transform_shape(TensorInfo(batched_gemm_shape, 1, data_type), winograd_info);
+
+ // Dummy matrix C to perform matrix multiplication
+ SimpleTensor<T> dummy_c{ batched_gemm_shape, data_type, 1 };
+
+ // Compute Winograd-based convolution
+ SimpleTensor<T> input_transform_out = reference::winograd_input_transform<T>(src, input_transform_shape, winograd_info);
+ SimpleTensor<T> filter_transform_out = reference::winograd_filter_transform<T>(weights, filter_transform_shape, winograd_info);
+ SimpleTensor<T> batched_gemm = reference::gemm<T>(input_transform_out, filter_transform_out, dummy_c, 1.0f, 0.0f);
+ SimpleTensor<T> conv_out = reference::winograd_output_transform<T>(batched_gemm, bias, output_transform_shape, winograd_info);
+
+ return (act_info.enabled()) ? reference::activation_layer<T>(conv_out, act_info) : conv_out;
+ }
+
+ TensorType _target{};
+ SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
class WinogradInputTransformValidationFixture : public framework::Fixture
{
public:
@@ -373,11 +491,13 @@ protected:
{
// Create reference
SimpleTensor<T> src{ input_shape, data_type };
+ SimpleTensor<T> bias{ TensorShape(input_shape[0]), data_type };
// Fill reference
fill(src, 0, -1.f, 1.f);
+ fill(bias, 1, 0.0f, 0.0f); // Fill with zeros as we validate just the output transform without bias contribution
- return reference::winograd_output_transform<T>(src, output_shape, winograd_info);
+ return reference::winograd_output_transform<T>(src, bias, output_shape, winograd_info);
}
TensorType _target{};
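
For context, the 4x4 output tile and 3x3 kernels configured in the fixture above follow the standard Winograd F(m x m, r x r) tile arithmetic: each input tile carries (m + r - 1)^2 transformed elements, and the output plane is covered by ceil(out/m) tiles per axis. Below is a minimal sketch of that arithmetic with hypothetical names; it is not the library's compute_winograd_*_shape helpers and omits their exact tensor layouts.

#include <cstdio>

struct WinogradTiling
{
    int tile_elems; // (m + r - 1)^2 transformed values per tile
    int num_tiles;  // tiles needed to cover the output plane
};

WinogradTiling winograd_tiling(int out_w, int out_h, int m, int r)
{
    const int t = m + r - 1; // input tile edge: 6 for F(4x4, 3x3)
    WinogradTiling wt;
    wt.tile_elems = t * t;   // 36 for F(4x4, 3x3)
    wt.num_tiles  = ((out_w + m - 1) / m) * ((out_h + m - 1) / m);
    return wt;
}

int main()
{
    // 112x112 output plane, 4x4 output tile, 3x3 kernel
    const WinogradTiling wt = winograd_tiling(112, 112, 4, 3);
    std::printf("%d tiles of %d transformed elements\n", wt.num_tiles, wt.tile_elems);
    return 0;
}
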
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index 77d025ec8e..f9dcfcbdd0 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,23 +41,44 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
SimpleTensor<T> dst{ c.shape(), c.data_type(), 1, c.fixed_point_position() };
// Compute reference
- const int M = dst.shape().y();
- const int N = dst.shape().x();
+ const int M = a.shape().y();
+ const int N = b.shape().x();
const int K = a.shape().x();
+ const int D = a.shape().z(); // Number of matrices in a batch
+ const int W = a.shape()[3]; // Number of batched GEMMs (Winograd case)
+
+ const int a_stride_z = K * M;
+ const int a_stride_w = K * M * D;
+
+ const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0; // Do not slide matrix B along the 3rd dimension when matrix B has fewer than 3 dimensions
+ const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide matrix B along the 4th dimension when matrix B has fewer than 4 dimensions
- for(int row = 0; row < M; ++row)
+ const int c_stride_z = N * M;
+ const int c_stride_w = N * M * D;
+
+ for(int w = 0; w < W; ++w)
{
- for(int col = 0; col < N; ++col)
+ for(int depth = 0; depth < D; ++depth)
{
- T acc(0);
+ const int base_addr_a = depth * a_stride_z + w * a_stride_w;
+ const int base_addr_b = depth * b_stride_z + w * b_stride_w;
+ const int base_addr_c = depth * c_stride_z + w * c_stride_w;
- for(int k = 0; k < K; ++k)
+ for(int row = 0; row < M; ++row)
{
- acc += a[row * K + k] * b[k * N + col];
+ for(int col = 0; col < N; ++col)
+ {
+ T acc(0);
+
+ for(int k = 0; k < K; ++k)
+ {
+ acc += a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N];
+ }
+
+ // Finalize the result: alpha * A * B + beta * C
+ dst[base_addr_c + col + row * N] = alpha * acc + beta * c[base_addr_c + col + row * N];
+ }
}
-
- // Finalize the result: alpha * A * B + beta * C
- dst[col + row * N] = alpha * acc + beta * c[col + row * N];
}
}
@@ -75,37 +96,58 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
// Compute reference
using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
- const int M = dst.shape().y();
- const int N = dst.shape().x();
- const int K = a.shape().x();
- const int fixed_point_position = a.fixed_point_position();
+ const int M = dst.shape().y();
+ const int N = dst.shape().x();
+ const int K = a.shape().x();
+ const int D = a.shape().z(); // Number of matrices in a batch
+ const int W = a.shape()[3]; // Number of batched GEMMs (Winograd case)
+
+ const int a_stride_z = K * M;
+ const int a_stride_w = K * M * D;
+
+ const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0; // Do not slide matrix B along the 3rd dimension when matrix B has fewer than 3 dimensions
+ const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide matrix B along the 4th dimension when matrix B has fewer than 4 dimensions
+
+ const int c_stride_z = N * M;
+ const int c_stride_w = N * M * D;
+ const int fixed_point_position = a.fixed_point_position();
const fixed_point<T> alpha_q(alpha, fixed_point_position);
const fixed_point<T> beta_q(beta, fixed_point_position);
- for(int row = 0; row < M; ++row)
+ for(int w = 0; w < W; ++w)
{
- for(int col = 0; col < N; ++col)
+ for(int depth = 0; depth < D; ++depth)
{
- fixed_point<promoted_type> acc_q(0, fixed_point_position);
+ const int base_addr_a = depth * a_stride_z + w * a_stride_w;
+ const int base_addr_b = depth * b_stride_z + w * b_stride_w;
+ const int base_addr_c = depth * c_stride_z + w * c_stride_w;
- for(int k = 0; k < K; ++k)
+ for(int row = 0; row < M; ++row)
{
- const fixed_point<promoted_type> a0_q(a[row * K + k], fixed_point_position, true);
- const fixed_point<promoted_type> b0_q(b[k * N + col], fixed_point_position, true);
+ for(int col = 0; col < N; ++col)
+ {
+ fixed_point<promoted_type> acc_q(0, fixed_point_position);
- acc_q = acc_q + (a0_q * b0_q);
- }
+ for(int k = 0; k < K; ++k)
+ {
+ const fixed_point<promoted_type> a0_q(a[base_addr_a + row * K + k], fixed_point_position, true);
+ const fixed_point<promoted_type> b0_q(b[base_addr_b + k * N + col], fixed_point_position, true);
+
+ acc_q = acc_q + (a0_q * b0_q);
+ }
- // Finalize the result: alpha * A * B + beta * C
- const fixed_point<T> c0_q(c[col + row * N], fixed_point_position, true);
+ // Finalize the result: alpha * A * B + beta * C
+ const fixed_point<T> c0_q(c[base_addr_c + col + row * N], fixed_point_position, true);
- fixed_point<T> res_q(acc_q);
- res_q = alpha_q * res_q;
- res_q = res_q + (beta_q * c0_q);
+ fixed_point<T> res_q(acc_q);
+ res_q = alpha_q * res_q;
+ res_q = res_q + (beta_q * c0_q);
- // Store the result
- dst[col + row * N] = res_q.raw();
+ // Store the result
+ dst[base_addr_c + col + row * N] = res_q.raw();
+ }
+ }
}
}
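
The essence of the GEMM change above is the zero-stride broadcast of matrix B: when B has fewer than 3 (or 4) dimensions, its z/w strides collapse to 0, so every batch multiplies against the same transformed-filter matrix. Below is a standalone sketch of the same addressing on flat row-major buffers; names are illustrative and it handles float only.

#include <vector>

// dst = alpha * A * B + beta * C over W * D batched [M x K] * [K x N] products.
// b_batched == false models a B tensor with fewer than 3 dimensions:
// its z/w strides are 0, so every batch reads the same matrix.
void batched_gemm(const std::vector<float> &a, const std::vector<float> &b,
                  const std::vector<float> &c, std::vector<float> &dst,
                  int M, int N, int K, int D, int W,
                  bool b_batched, float alpha, float beta)
{
    const int a_stride_z = K * M, a_stride_w = K * M * D;
    const int b_stride_z = b_batched ? N * K : 0;     // broadcast B when flat
    const int b_stride_w = b_batched ? N * K * D : 0;
    const int c_stride_z = N * M, c_stride_w = N * M * D;

    for(int w = 0; w < W; ++w)
    {
        for(int d = 0; d < D; ++d)
        {
            const int base_a = d * a_stride_z + w * a_stride_w;
            const int base_b = d * b_stride_z + w * b_stride_w;
            const int base_c = d * c_stride_z + w * c_stride_w;

            for(int row = 0; row < M; ++row)
            {
                for(int col = 0; col < N; ++col)
                {
                    float acc = 0.f;
                    for(int k = 0; k < K; ++k)
                    {
                        acc += a[base_a + row * K + k] * b[base_b + k * N + col];
                    }
                    dst[base_c + row * N + col] = alpha * acc + beta * c[base_c + row * N + col];
                }
            }
        }
    }
}

Note that the Winograd fixture above calls the reference GEMM with alpha = 1.0f and beta = 0.0f, so the dummy C tensor never contributes to the result.
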
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 75b1b51d46..194a78e95f 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -331,7 +331,7 @@ SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const Tenso
}
template <typename T>
-SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info)
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const SimpleTensor<T> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info)
{
ARM_COMPUTE_ERROR_ON_MSG(winograd_info.output_data_layout != DataLayout::NCHW, "Only supported NCHW data format");
@@ -444,6 +444,9 @@ SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const Tenso
if((xo + xi < w_out) && (yo + yi < h_out))
{
out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * out_tile_w];
+
+ // Add the bias of the current output channel
+ out[output_offset + yi * stridey_out + xi] += b[zo];
}
}
}
@@ -456,7 +459,7 @@ SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const Tenso
template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
-template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const SimpleTensor<float> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
} // namespace reference
} // namespace validation
} // namespace test
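
The bias handling added to winograd_output_transform above is a plain per-output-channel broadcast: every spatial element written for output map zo receives b[zo]. The sketch below shows the same semantics on a flat NCHW plane; the helper name is hypothetical.

// Minimal sketch of the per-channel bias broadcast added to the output
// transform above: on an NCHW plane, every (x, y) of channel z gets bias[z].
void add_channel_bias(float *out, const float *bias, int w, int h, int channels)
{
    for(int z = 0; z < channels; ++z)
    {
        for(int y = 0; y < h; ++y)
        {
            for(int x = 0; x < w; ++x)
            {
                out[(z * h + y) * w + x] += bias[z];
            }
        }
    }
}
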
diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h
index 29181f1142..b74c2c3e29 100644
--- a/tests/validation/reference/Winograd.h
+++ b/tests/validation/reference/Winograd.h
@@ -51,7 +51,7 @@ template <typename T>
SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
template <typename T>
-SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const SimpleTensor<T> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
} // namespace reference
} // namespace validation
} // namespace test
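
With the new signature, a caller that wants to exercise the output transform in isolation passes an explicitly zeroed bias, as the fixture change above does. Below is a sketch of that calling pattern; the include paths, using-directives, and tensor accessors are assumptions taken from the test framework rather than shown in this patch.

#include "tests/SimpleTensor.h"
#include "tests/validation/reference/Winograd.h"

#include <algorithm>

using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;

SimpleTensor<float> run_output_transform_only(const SimpleTensor<float> &src,
                                              const TensorShape    &output_shape,
                                              const WinogradInfo   &winograd_info)
{
    // Zero-filled bias: isolates the output transform, as in the fixture above
    SimpleTensor<float> bias{ TensorShape(src.shape()[0]), DataType::F32 };
    std::fill_n(bias.data(), bias.num_elements(), 0.f);

    return reference::winograd_output_transform<float>(src, bias, output_shape, winograd_info);
}
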