From 2213d4b334567d0cb7f283090d42b5fb1b70f66b Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 27 Apr 2018 10:39:06 +0100 Subject: COMPMID-1096 - Add fast_math flag to CLConvolutionLayer COMPMID-1103 - CLWinogradConvolutionLayer mismatches Change-Id: Iceaa9482a1790ec39d2720c220261aaea8043978 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129398 Tested-by: Jenkins Reviewed-by: Giorgio Arena Reviewed-by: Georgios Pinitas --- .../core/NEON/kernels/NEReshapeLayerKernel.h | 2 +- .../runtime/CL/functions/CLConvolutionLayer.h | 81 +++++++------- .../CL/functions/CLWinogradConvolutionLayer.h | 50 +++++---- .../runtime/NEON/functions/NEConvolutionLayer.h | 2 +- .../NEON/functions/NEPixelWiseMultiplication.h | 10 +- .../runtime/NEON/functions/NEReshapeLayer.h | 4 +- src/core/CL/cl_kernels/winograd.cl | 16 +-- src/runtime/CL/functions/CLConvolutionLayer.cpp | 37 +++---- .../CL/functions/CLWinogradConvolutionLayer.cpp | 89 ++++++++++----- tests/datasets/LargeConvolutionLayerDataset.h | 19 ++++ tests/validation/CL/ConvolutionLayer.cpp | 102 ++++++++++------- tests/validation/CL/DilatedConvolutionLayer.cpp | 6 +- tests/validation/CL/Winograd.cpp | 24 +++- .../fixtures/WinogradConvolutionLayerFixture.h | 122 ++++++++++++++++++++- tests/validation/reference/GEMM.cpp | 102 ++++++++++++----- tests/validation/reference/Winograd.cpp | 7 +- tests/validation/reference/Winograd.h | 2 +- 17 files changed, 477 insertions(+), 198 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h index cd70198d6c..0a3fc44881 100644 --- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h @@ -40,7 +40,7 @@ public: } /** Set the input and output of the kernel * - * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] input Source tensor. 
Data type supported: U8/S8/QS8/U16/S16/QS16/QASYMM8/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index a1cd15515f..5c05334a56 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -37,6 +37,7 @@ namespace arm_compute /** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions: * * -# @ref CLGEMMConvolutionLayer + * -# @ref CLWinogradConvolutionLayer * -# @ref CLDirectConvolutionLayer */ class CLConvolutionLayer : public IFunction @@ -46,57 +47,63 @@ public: CLConvolutionLayer(std::shared_ptr memory_manager = nullptr); /** Set the input and output tensors. * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QASYMM8/QS16/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. - * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. 
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. 
Default is false */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QASYMM8/QS16/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QASYMM8/QS16/F16/F32. 
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] gpu_target Specifies the @p GPUTarget. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p input. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] gpu_target Specifies the @p GPUTarget. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] enable_fast_math (Optional) Enable fast math computation. 
In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false * * @return a status */ static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U)); + const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h index 2cf1f77fb4..a27976959c 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h @@ -51,38 +51,44 @@ public: CLWinogradConvolutionLayer(std::shared_ptr memory_manager = nullptr); /** Set the input and output tensors. * - * @note: This function only works with 3x3 kernels and unit strides + * @note: This function only works with 3x3 and 5x5 kernels along with unit strides + * @note Some Winograd configurations (i.e. F(4x4, 3x3) and F(4x4, 5x5)) are supported only with enable_fast_math = true * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input - * @param[out] output Destination tensor. 
3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. 
Default is false */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer * - * @note: This function only works with 3x3 kernels and unit strides + * @note: This function only works with 3x3 and 5x5 kernels along with unit strides + * @note Some Winograd configurations (i.e. F(4x4, 3x3) and F(4x4, 5x5)) are supported only with enable_fast_math = true * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input - * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. 
Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); + const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index ce9a3ed4f2..b82ba89f7c 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -52,7 +52,7 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QASYMM8/QS16/F32. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. 
* Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index 7d5e397e80..ba96ae6cfa 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -37,11 +37,11 @@ class NEPixelWiseMultiplication : public INESimpleFunction public: /** Initialise the kernel's inputs, output and convertion policy. * - * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/S16/F32. + * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/S16/F16/F32. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output Output tensor. Data types supported: U8/QS8/S16/F32. + * @param[out] output Output tensor. Data types supported: U8/QS8/S16/F16/F32. * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. * @param[in] overflow_policy Overflow policy. @@ -50,9 +50,9 @@ public: void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication * - * @param[in] input1 First tensor info input. Data types supported: U8/QS8/S16/F32. - * @param[in] input2 Second tensor info input. Data types supported: U8/QS8/S16/F32. - * @param[in] output Output tensor info. 
Data types supported: U8/QS8/S16/F32. + * @param[in] input1 First tensor info input. Data types supported: U8/QS8/S16/F16/F32. + * @param[in] input2 Second tensor info input. Data types supported: U8/QS8/S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8/QS8/S16/F16/F32. * @param[in] scale Scale to apply after multiplication. Must be positive. * @param[in] overflow_policy Overflow policy. * @param[in] rounding_policy Rounding policy. diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h index 369f50e147..0bab534ebc 100644 --- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,7 +37,7 @@ class NEReshapeLayer : public INESimpleFunction public: /** Initialise the kernel's inputs and outputs * - * @param[in] input First tensor input. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] input First tensor input. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 * @param[out] output Output tensor. 
Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl index f40a969ea0..0458e53734 100644 --- a/src/core/CL/cl_kernels/winograd.cl +++ b/src/core/CL/cl_kernels/winograd.cl @@ -405,16 +405,16 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw( out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; - out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f * + out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; - out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f * + out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; - out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * 
w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f * + out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; - out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s0 + w40.s1) + 4.f * + out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f; @@ -427,16 +427,16 @@ __kernel void winograd_filter_transform_4x4_5x5_nchw( out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f; - out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f * + out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + 
w40.s3) + 16.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; - out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f * + out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; - out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f * + out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; - out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s0 + w40.s1) + 4.f * + out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f; diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 97ef895434..83281e1747 
100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -43,32 +43,33 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_ma } void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) + const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, + enable_fast_math)); switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, - weights_info, act_info, CLScheduler::get().target(), dilation)) + weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, biases, output, conv_info); + f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); _function = std::move(f); break; } case ConvolutionMethod::DIRECT: { auto f = arm_compute::support::cpp14::make_unique(); - f->configure(input, weights, biases, output, conv_info); + f->configure(input, weights, biases, output, conv_info, act_info); _function = std::move(f); break; } case ConvolutionMethod::GEMM: { auto f = arm_compute::support::cpp14::make_unique(_memory_manager); - f->configure(input, weights, biases, output, conv_info, weights_info, dilation); 
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info); _function = std::move(f); break; } @@ -79,19 +80,18 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c } Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info) + const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - //Configure if the parameters match the direct convolution or the gemm-based const GPUTarget gpu_target = CLScheduler::get().target(); - switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation)) + switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { //Validate Winograd - CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info); + CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); break; } case ConvolutionMethod::DIRECT: @@ -115,25 +115,22 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo } ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation) + const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); 
ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_UNUSED(weights_info); ARM_COMPUTE_UNUSED(gpu_target); - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - - if((input->data_type() == DataType::F32) && (input->data_layout() == DataLayout::NCHW) && (input->dimension(idx_c) > 3) && (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3) - && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1) && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U)) && (!act_info.enabled())) + if(dilation != Size2D(1U, 1U)) + { + return ConvolutionMethod::GEMM; + } + else { - return ConvolutionMethod::WINOGRAD; + return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; } - return ConvolutionMethod::GEMM; } void CLConvolutionLayer::run() diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 65747cf5d7..5ff4fbceee 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -31,33 +31,69 @@ using namespace arm_compute; +namespace +{ +Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, bool enable_fast_math) +{ + Size2D output_tile = Size2D{}; + + if(kernel_dims == Size2D(3U, 3U)) + { + output_tile = ((input_dims.width <= 4 && input_dims.height <= 4) || !enable_fast_math) ? 
Size2D(2U, 2U) : Size2D(4U, 4U); + } + else if(kernel_dims == Size2D(5U, 5U)) + { + output_tile = Size2D(4U, 4U); + } + + return output_tile; +} + +bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size) +{ + // Check if we want to configure a Winograd configuration which requires fast math + using WinogradConfiguration = std::pair, std::pair>; + + std::vector fast_math_winograd = + { + WinogradConfiguration(std::pair(4, 4), std::pair(3, 3)), + WinogradConfiguration(std::pair(4, 4), std::pair(5, 5)) + }; + + auto p = std::make_pair(std::pair(output_tile.width, output_tile.height), + std::pair(kernel_size.width, kernel_size.height)); + + return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); +} +} // namespace + CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(), _is_first_run(true), _is_activationlayer_enabled(false) { } -void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, + bool enable_fast_math) { // Get indices for the width and height const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - // Input shape - const TensorShape input_shape = input->info()->tensor_shape(); - const unsigned int input_w = input->info()->tensor_shape()[idx_width]; - 
const unsigned int input_h = input->info()->tensor_shape()[idx_height]; - - // Kernel size - const unsigned int kernel_w = weights->info()->tensor_shape()[idx_width]; - const unsigned int kernel_h = weights->info()->tensor_shape()[idx_height]; + // Input shape, kernel size and output tile + const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, enable_fast_math); - //Winograd output tile - const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w <= 4 && input_h <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); + // Check if the Winograd configuration requires fast math + if(!enable_fast_math) + { + ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + } const WinogradInfo winograd_info = WinogradInfo(output_tile, - Size2D(kernel_w, kernel_h), - Size2D(input_shape[idx_width], input_shape[idx_height]), + kernel_size, + input_dims, conv_info, input->info()->data_layout()); @@ -93,27 +129,26 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we } Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) + const ActivationLayerInfo &act_info, bool enable_fast_math) { // Get indeces for the width and height const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - // Input shape - const TensorShape input_shape = input->tensor_shape(); - const unsigned int 
input_w = input->tensor_shape()[idx_width]; - const unsigned int input_h = input->tensor_shape()[idx_height]; - - // Kernel size - const unsigned int kernel_w = weights->tensor_shape()[idx_width]; - const unsigned int kernel_h = weights->tensor_shape()[idx_height]; + // Input shape, kernel size and output tile + const Size2D input_dims = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]); + const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); + const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, enable_fast_math); - //Winograd output tile - const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w <= 4 && input_h <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); + // Check if the Winograd configuration requires fast math + if(!enable_fast_math) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + } const WinogradInfo winograd_info = WinogradInfo(output_tile, - Size2D(kernel_w, kernel_h), - Size2D(input_shape[idx_width], input_shape[idx_height]), + kernel_size, + input_dims, conv_info, input->data_layout()); @@ -139,7 +174,7 @@ Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen // Validate Activation Layer if(act_info.enabled()) { - CLActivationLayer::validate(output, nullptr, act_info); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); } return Status{}; diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h index ec8e09fa81..36b3d60d57 100644 --- a/tests/datasets/LargeConvolutionLayerDataset.h +++ b/tests/datasets/LargeConvolutionLayerDataset.h @@ -59,6 +59,25 @@ public: } }; +class LargeWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer5x5Dataset() + { + // Kernel 
size 5 + // Batch size 1 + add_config(TensorShape(224U, 224U, 3U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(222U, 222U, 64U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(123U, 134U, 16U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(121U, 130U, 7U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(181U, 152U, 42U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U), PadStrideInfo(1, 1, 0, 0)); + add_config(TensorShape(200U, 201U, 24U), TensorShape(5U, 5U, 24U, 61), TensorShape(61U), TensorShape(200U, 201U, 61), PadStrideInfo(1, 1, 2, 2)); + + // Batch size 2, 3 and 4 + add_config(TensorShape(224U, 224U, 3U, 2U), TensorShape(5U, 5U, 3U, 64U), TensorShape(64U), TensorShape(222U, 222U, 64U, 2U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(123U, 134U, 16U, 3U), TensorShape(5U, 5U, 16U, 7U), TensorShape(7U), TensorShape(121U, 130U, 7U, 3U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(181U, 152U, 42U, 4U), TensorShape(5U, 5U, 42U, 100U), TensorShape(100U), TensorShape(177U, 148U, 100U, 4U), PadStrideInfo(1, 1, 0, 0)); + } +}; + class LargeConvolutionLayerDataset final : public ConvolutionLayerDataset { public: diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp index 8685e5bbc7..a2b55a8555 100644 --- a/tests/validation/CL/ConvolutionLayer.cpp +++ b/tests/validation/CL/ConvolutionLayer.cpp @@ -73,44 +73,72 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo TEST_SUITE(CL) TEST_SUITE(ConvolutionLayer) -DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), - TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), - TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0), - 
TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0) - }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), - TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0) - })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0), - TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0), - TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0) - })), - framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1), - PadStrideInfo(1, 2, 1, 1), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(3, 2, 1, 0) - })), - framework::dataset::make("GpuTarget", { GPUTarget::BIFROST, - GPUTarget::MIDGARD, - GPUTarget::G71, - GPUTarget::MIDGARD, - GPUTarget::BIFROST - })), - - framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })), - input_info, weights_info, output_info, conv_info, gpu_target, expected) +DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), + TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0), + TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0) + }), + 
framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0), + TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0) + })), + framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, 0), + TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, 0) + })), + framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1), + PadStrideInfo(1, 2, 1, 1), + PadStrideInfo(1, 1, 0, 0), + PadStrideInfo(2, 1, 0, 0), + PadStrideInfo(3, 2, 1, 0), + PadStrideInfo(1, 1, 2, 2), + PadStrideInfo(1, 1, 2, 2) + })), + framework::dataset::make("GpuTarget", { GPUTarget::BIFROST, + GPUTarget::MIDGARD, + GPUTarget::G71, + GPUTarget::MIDGARD, + GPUTarget::BIFROST, + GPUTarget::BIFROST, + GPUTarget::BIFROST + })), + framework::dataset::make("Dilation", { - ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false), - &weights_info.clone()->set_is_resizable(false), - &output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target); + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(2U, 1U), +})), +framework::dataset::make("EnableFastMath", { false, false, false, false, false, true, true })), 
+framework::dataset::make("Expected", +{ + ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, +})), +input_info, weights_info, output_info, conv_info, gpu_target, dilation, enable_fast_math, expected) +{ + ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true), + &weights_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true), conv_info, + WeightsInfo(), + ActivationLayerInfo(), + gpu_target, + dilation, + enable_fast_math); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } TEST_SUITE_END() diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp index e6a765bbe1..9ee002cc5a 100644 --- a/tests/validation/CL/DilatedConvolutionLayer.cpp +++ b/tests/validation/CL/DilatedConvolutionLayer.cpp @@ -104,9 +104,9 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })), input_info, weights_info, output_info, conv_info, gpu_target, dilation, expected) { - ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false), - &weights_info.clone()->set_is_resizable(false), - &output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target, dilation); + ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true), + &weights_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target, dilation); ARM_COMPUTE_EXPECT(is_valid == expected, 
framework::LogLevel::ERRORS); } TEST_SUITE_END() diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp index 30d8d751af..d892c9f77f 100644 --- a/tests/validation/CL/Winograd.cpp +++ b/tests/validation/CL/Winograd.cpp @@ -51,7 +51,8 @@ namespace validation { namespace { -constexpr AbsoluteTolerance tolerance_f32(0.001f); +constexpr AbsoluteTolerance tolerance_f32(0.0001f); +constexpr AbsoluteTolerance tolerance_fast_math_f32(0.1f); } // namespace using namespace arm_compute::misc::shape_calculator; @@ -379,6 +380,27 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFixture, framework::D // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } +TEST_SUITE(EnableFastMath) +using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(), datasets::SmallWinogradConvolutionLayer5x5Dataset()), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_fast_math_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY, + combine(combine(framework::dataset::concat(datasets::LargeWinogradConvolutionLayer3x3Dataset(), datasets::LargeWinogradConvolutionLayer5x5Dataset()), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_fast_math_f32); +} + +TEST_SUITE_END() // EnableFastMath TEST_SUITE_END() // ConvolutionLayer TEST_SUITE_END() // Winograd diff --git 
a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h index 249f9d5649..e15931eafb 100644 --- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h +++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h @@ -35,6 +35,7 @@ #include "tests/validation/Helpers.h" #include "tests/validation/reference/ActivationLayer.h" #include "tests/validation/reference/ConvolutionLayer.h" +#include "tests/validation/reference/GEMM.h" #include "tests/validation/reference/Utils.h" #include "tests/validation/reference/Winograd.h" @@ -152,6 +153,123 @@ protected: SimpleTensor _reference{}; }; +template +class WinogradConvolutionLayerFastMathValidationFixture : public framework::Fixture +{ +public: + template + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataType data_type, ActivationLayerInfo act_info) + { + ARM_COMPUTE_UNUSED(dilation); + + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info); + _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info); + } + +protected: + template + void fill(U &&tensor, int i, float min, float max) + { + switch(tensor.data_type()) + { + case DataType::F32: + { + std::uniform_real_distribution<> distribution(min, max); + library->fill(tensor, distribution, i); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + library->fill_tensor_uniform(tensor, i); + break; + } + } + } + + TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + DataType data_type, ActivationLayerInfo act_info) + { + // Create tensors + TensorType src = create_tensor(input_shape, data_type, 1); + TensorType weights = create_tensor(weights_shape, 
data_type, 1); + TensorType bias = create_tensor(bias_shape, data_type, 1); + TensorType dst = create_tensor(output_shape, data_type, 1); + + // Create and configure function + FunctionType conv; + ARM_COMPUTE_EXPECT(static_cast(conv.validate(src.info(), weights.info(), bias.info(), dst.info(), info, act_info, true /* Enable fast math */)), framework::LogLevel::ERRORS); + conv.configure(&src, &weights, &bias, &dst, info, act_info, true /* Enable fast math */); + + ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Allocate tensors + src.allocator()->allocate(); + weights.allocator()->allocate(); + dst.allocator()->allocate(); + bias.allocator()->allocate(); + + ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Fill tensors + fill(AccessorType(src), 0, -1.f, 1.f); + fill(AccessorType(weights), 1, -1.f, 1.f); + fill(AccessorType(bias), 2, -1.f, 1.f); + + // Compute Winograd Convolution function + conv.run(); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + DataType data_type, ActivationLayerInfo act_info) + { + // Create reference + SimpleTensor src{ input_shape, data_type, 1 }; + SimpleTensor weights{ weights_shape, data_type, 1 }; + SimpleTensor bias{ bias_shape, data_type, 1 }; + + // Fill reference + fill(src, 0, -1.f, 1.f); + fill(weights, 1, -1.f, 1.f); + 
fill(bias, 2, -1.f, 1.f); + + WinogradInfo winograd_info(Size2D(4U, 4U), + Size2D(weights_shape[0], weights_shape[1]), + Size2D(input_shape[0], input_shape[1]), + info, + src.data_layout()); + + // Compute tensor shapes for input, filter and output transforms + TensorShape input_transform_shape = compute_winograd_input_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info); + TensorShape filter_transform_shape = compute_winograd_filter_transform_shape(TensorInfo(weights_shape, 1, data_type), winograd_info); + TensorShape batched_gemm_shape = input_transform_shape; + batched_gemm_shape[0] = filter_transform_shape[0]; + TensorShape output_transform_shape = compute_winograd_output_transform_shape(TensorInfo(batched_gemm_shape, 1, data_type), winograd_info); + + // Dummy matrix C to perform matrix multiplication + SimpleTensor dummy_c{ batched_gemm_shape, data_type, 1 }; + + // Compute Winograd-based convolution + SimpleTensor input_transform_out = reference::winograd_input_transform(src, input_transform_shape, winograd_info); + SimpleTensor filter_transform_out = reference::winograd_filter_transform(weights, filter_transform_shape, winograd_info); + SimpleTensor batched_gemm = reference::gemm(input_transform_out, filter_transform_out, dummy_c, 1.0f, 0.0f); + SimpleTensor conv_out = reference::winograd_output_transform(batched_gemm, bias, output_transform_shape, winograd_info); + + return (act_info.enabled()) ? 
reference::activation_layer(conv_out, act_info) : conv_out; + } + + TensorType _target{}; + SimpleTensor _reference{}; +}; + template class WinogradInputTransformValidationFixture : public framework::Fixture { @@ -373,11 +491,13 @@ protected: { // Create reference SimpleTensor src{ input_shape, data_type }; + SimpleTensor bias{ TensorShape(input_shape[0]), data_type }; // Fill reference fill(src, 0, -1.f, 1.f); + fill(bias, 1, 0.0f, 0.0f); // Fill with zeros as we validate just the output transform without bias contribution - return reference::winograd_output_transform(src, output_shape, winograd_info); + return reference::winograd_output_transform(src, bias, output_shape, winograd_info); } TensorType _target{}; diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp index 77d025ec8e..f9dcfcbdd0 100644 --- a/tests/validation/reference/GEMM.cpp +++ b/tests/validation/reference/GEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,23 +41,44 @@ SimpleTensor gemm(const SimpleTensor &a, const SimpleTensor &b, const S SimpleTensor dst{ c.shape(), c.data_type(), 1, c.fixed_point_position() }; // Compute reference - const int M = dst.shape().y(); - const int N = dst.shape().x(); + const int M = a.shape().y(); + const int N = b.shape().x(); const int K = a.shape().x(); + const int D = a.shape().z(); // Number of matrices in a batch + const int W = a.shape()[3]; // Number of batched-gemm (Winograd case) + + const int a_stride_z = K * M; + const int a_stride_w = K * M * D; + + const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0; // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions + const int b_stride_w = b.shape().num_dimensions() > 3 ? 
K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions - for(int row = 0; row < M; ++row) + const int c_stride_z = N * M; + const int c_stride_w = N * M * D; + + for(int w = 0; w < W; ++w) { - for(int col = 0; col < N; ++col) + for(int depth = 0; depth < D; ++depth) { - T acc(0); + const int base_addr_a = depth * a_stride_z + w * a_stride_w; + const int base_addr_b = depth * b_stride_z + w * b_stride_w; + const int base_addr_c = depth * c_stride_z + w * c_stride_w; - for(int k = 0; k < K; ++k) + for(int row = 0; row < M; ++row) { - acc += a[row * K + k] * b[k * N + col]; + for(int col = 0; col < N; ++col) + { + T acc(0); + + for(int k = 0; k < K; ++k) + { + acc += a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N]; + } + + // Finalize the result: alpha * A * B + beta * C + dst[base_addr_c + col + row * N] = alpha * acc + beta * c[base_addr_c + col + row * N]; + } } - - // Finalize the result: alpha * A * B + beta * C - dst[col + row * N] = alpha * acc + beta * c[col + row * N]; } } @@ -75,37 +96,58 @@ SimpleTensor gemm(const SimpleTensor &a, const SimpleTensor &b, const S // Compute reference using promoted_type = fixed_point_arithmetic::traits::promote_t; - const int M = dst.shape().y(); - const int N = dst.shape().x(); - const int K = a.shape().x(); - const int fixed_point_position = a.fixed_point_position(); + const int M = dst.shape().y(); + const int N = dst.shape().x(); + const int K = a.shape().x(); + const int D = a.shape().z(); // Number of matrices in a batch + const int W = a.shape()[3]; // Number of batched-gemm (Winograd case) + + const int a_stride_z = K * M; + const int a_stride_w = K * M * D; + + const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0; // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions + const int b_stride_w = b.shape().num_dimensions() > 3 ? 
K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions + + const int c_stride_z = N * M; + const int c_stride_w = N * M * D; + const int fixed_point_position = a.fixed_point_position(); const fixed_point alpha_q(alpha, fixed_point_position); const fixed_point beta_q(beta, fixed_point_position); - for(int row = 0; row < M; ++row) + for(int w = 0; w < W; ++w) { - for(int col = 0; col < N; ++col) + for(int depth = 0; depth < D; ++depth) { - fixed_point acc_q(0, fixed_point_position); + const int base_addr_a = depth * a_stride_z + w * a_stride_w; + const int base_addr_b = depth * b_stride_z + w * b_stride_w; + const int base_addr_c = depth * c_stride_z + w * c_stride_w; - for(int k = 0; k < K; ++k) + for(int row = 0; row < M; ++row) { - const fixed_point a0_q(a[row * K + k], fixed_point_position, true); - const fixed_point b0_q(b[k * N + col], fixed_point_position, true); + for(int col = 0; col < N; ++col) + { + fixed_point acc_q(0, fixed_point_position); - acc_q = acc_q + (a0_q * b0_q); - } + for(int k = 0; k < K; ++k) + { + const fixed_point a0_q(a[base_addr_a + row * K + k], fixed_point_position, true); + const fixed_point b0_q(b[base_addr_b + k * N + col], fixed_point_position, true); + + acc_q = acc_q + (a0_q * b0_q); + } - // Finalize the result: alpha * A * B + beta * C - const fixed_point c0_q(c[col + row * N], fixed_point_position, true); + // Finalize the result: alpha * A * B + beta * C + const fixed_point c0_q(c[base_addr_c + col + row * N], fixed_point_position, true); - fixed_point res_q(acc_q); - res_q = alpha_q * res_q; - res_q = res_q + (beta_q * c0_q); + fixed_point res_q(acc_q); + res_q = alpha_q * res_q; + res_q = res_q + (beta_q * c0_q); - // Store the result - dst[col + row * N] = res_q.raw(); + // Store the result + dst[base_addr_c + col + row * N] = res_q.raw(); + } + } } } diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp index 
75b1b51d46..194a78e95f 100644 --- a/tests/validation/reference/Winograd.cpp +++ b/tests/validation/reference/Winograd.cpp @@ -331,7 +331,7 @@ SimpleTensor winograd_filter_transform(const SimpleTensor &in, const Tenso } template -SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info) +SimpleTensor winograd_output_transform(const SimpleTensor &in, const SimpleTensor &b, const TensorShape &output_shape, const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_MSG(winograd_info.output_data_layout != DataLayout::NCHW, "Only supported NCHW data format"); @@ -444,6 +444,9 @@ SimpleTensor winograd_output_transform(const SimpleTensor &in, const Tenso if((xo + xi < w_out) && (yo + yi < h_out)) { out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * out_tile_w]; + + // Add bias + out[output_offset + yi * stridey_out + xi] += b[zo]; } } } @@ -456,7 +459,7 @@ SimpleTensor winograd_output_transform(const SimpleTensor &in, const Tenso template SimpleTensor winograd_filter_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info); template SimpleTensor winograd_input_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info); -template SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info); +template SimpleTensor winograd_output_transform(const SimpleTensor &in, const SimpleTensor &b, const TensorShape &output_shape, const WinogradInfo &winograd_info); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h index 29181f1142..b74c2c3e29 100644 --- a/tests/validation/reference/Winograd.h +++ b/tests/validation/reference/Winograd.h @@ -51,7 +51,7 @@ template SimpleTensor winograd_filter_transform(const SimpleTensor &in, const 
TensorShape &output_shape, const WinogradInfo &winograd_info); template -SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const WinogradInfo &winograd_info); +SimpleTensor winograd_output_transform(const SimpleTensor &in, const SimpleTensor &b, const TensorShape &output_shape, const WinogradInfo &winograd_info); } // namespace reference } // namespace validation } // namespace test -- cgit v1.2.1