From e855c237a5b61c4ed5a5ab79dd4af27385cf72f5 Mon Sep 17 00:00:00 2001 From: Stephen Li Date: Thu, 4 Jan 2018 14:13:22 +0800 Subject: APPBROWSER-377: GCConvoutionLayer support for FP16 Change-Id: I801b5e393a16a9f92c062826e6fcfd5982ca7bb3 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116584 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- arm_compute/core/GLES_COMPUTE/GCKernels.h | 3 +- arm_compute/core/GLES_COMPUTE/IGCKernel.h | 2 +- .../core/GLES_COMPUTE/kernels/GCCol2ImKernel.h | 4 +- .../kernels/GCGEMMInterleave4x4Kernel.h | 4 +- .../kernels/GCGEMMTranspose1xWKernel.h | 4 +- .../core/GLES_COMPUTE/kernels/GCIm2ColKernel.h | 23 +- .../GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h | 67 +++ arm_compute/runtime/GLES_COMPUTE/GCFunctions.h | 3 +- .../GLES_COMPUTE/functions/GCConvolutionLayer.h | 138 ++++++ .../GLES_COMPUTE/functions/GCFullyConnectedLayer.h | 2 +- .../GLES_COMPUTE/functions/GCGEMMInterleave4x4.h | 4 +- src/core/GLES_COMPUTE/GCKernelLibrary.cpp | 5 +- src/core/GLES_COMPUTE/IGCKernel.cpp | 2 +- .../GLES_COMPUTE/cs_shaders/convolution_layer.cs | 518 +++++++++++++++++++-- src/core/GLES_COMPUTE/cs_shaders/gemm.cs | 192 +++++++- src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h | 2 +- src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp | 69 ++- .../kernels/GCDepthConcatenateLayerKernel.cpp | 2 +- .../kernels/GCDirectConvolutionLayerKernel.cpp | 2 +- .../GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp | 2 +- .../kernels/GCGEMMInterleave4x4Kernel.cpp | 4 +- .../kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp | 2 +- .../kernels/GCGEMMTranspose1xWKernel.cpp | 4 +- src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp | 118 +++-- .../GLES_COMPUTE/kernels/GCTransposeKernel.cpp | 2 +- .../kernels/GCWeightsReshapeKernel.cpp | 146 ++++++ .../GLES_COMPUTE/functions/GCConvolutionLayer.cpp | 285 ++++++++++++ .../functions/GCFullyConnectedLayer.cpp | 4 +- tests/benchmark/GLES_COMPUTE/ConvolutionLayer.cpp | 119 +++++ .../GoogLeNetInceptionV1ConvolutionLayerDataset.h | 2 +- tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp | 134 ++++++ 31 files changed, 1728 insertions(+), 140 deletions(-) create mode 100644 arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h create mode 100644 arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h mode change 100755 => 100644 src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs mode change 100755 => 100644 src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h create mode 100644 src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp create mode 100644 tests/benchmark/GLES_COMPUTE/ConvolutionLayer.cpp create mode 100644 tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp diff --git a/arm_compute/core/GLES_COMPUTE/GCKernels.h b/arm_compute/core/GLES_COMPUTE/GCKernels.h index 5be44984b2..a1f3c278c4 100644 --- a/arm_compute/core/GLES_COMPUTE/GCKernels.h +++ b/arm_compute/core/GLES_COMPUTE/GCKernels.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -48,5 +48,6 @@ #include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h" #include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h" #include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h" #endif /* __ARM_COMPUTE_GCKERNELS_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/IGCKernel.h b/arm_compute/core/GLES_COMPUTE/IGCKernel.h index ce7717e8ea..3b987330da 100644 --- a/arm_compute/core/GLES_COMPUTE/IGCKernel.h +++ b/arm_compute/core/GLES_COMPUTE/IGCKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h index 257ab0eca0..da7c27f1d7 100644 --- a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -72,7 +72,7 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. Data types supported: F32 + * @param[in] input The input tensor to convert. Data types supported: F16/F32 * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h index b2369a6ad1..6711115c77 100644 --- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -64,7 +64,7 @@ public: GCGEMMInterleave4x4Kernel &operator=(GCGEMMInterleave4x4Kernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: F32 + * @param[in] input Input tensor. Data types supported: F16, F32 * @param[out] output Output tensor. Data type supported: same as @p input */ void configure(const IGCTensor *input, IGCTensor *output); diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h index 4223556ac4..48e7a620b8 100644 --- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,7 +55,7 @@ class GCGEMMTranspose1xWKernel : public IGCSimple2DKernel public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: F32 + * @param[in] input Input tensor. Data types supported: F16, F32 * @param[out] output Output tensor. 
Data type supported: same as @p input */ void configure(const IGCTensor *input, IGCTensor *output); diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h index e1b35607ff..c376a3d17b 100644 --- a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + #ifndef __ARM_COMPUTE_GCIM2COLKERNEL_H__ #define __ARM_COMPUTE_GCIM2COLKERNEL_H__ @@ -29,6 +30,7 @@ namespace arm_compute { class IGCTensor; +class Size2D; /** Interface for the im2col reshape kernel. * @@ -68,18 +70,32 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32 * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, * while every dimension above represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. */ - void configure(const IGCTensor *input, IGCTensor *output, std::pair kernel_dims, const PadStrideInfo &conv_info, bool has_bias); + void configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); // Inherited methods overridden: void run(const Window &window) override; + /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel + * + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32 + * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, + * while every dimension above represents a batch. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); + private: /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and kernel's low 3 dimensions are same as input) * @@ -101,6 +117,7 @@ private: const IGCTensor *_input; IGCTensor *_output; std::pair _convolved_dims; + std::pair _kernel_dims; unsigned int _num_elems_processed_per_iteration; Im2ColFunction _run_func; }; diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h new file mode 100644 index 0000000000..bf315a2f15 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H__ +#define __ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class GCWeightsReshapeKernel : public IGCKernel +{ +public: + /** Constructor.*/ + GCWeightsReshapeKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCWeightsReshapeKernel(const GCWeightsReshapeKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCWeightsReshapeKernel &operator=(const GCWeightsReshapeKernel &) = delete; + /** Allow instances of this class to be moved */ + GCWeightsReshapeKernel(GCWeightsReshapeKernel &&) = default; + /** Allow instances of this class to be moved */ + GCWeightsReshapeKernel &operator=(GCWeightsReshapeKernel &&) = default; + /** Default destructor */ + ~GCWeightsReshapeKernel() = default; + + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, batches] if unshared. Data types supported: F16, F32 + * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with + * dimensions [OFM, batches] if unshared. Data types supported: Same as @p input + * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. 
+ * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input + */ + void configure(const IGCTensor *input, const IGCTensor *biases, IGCTensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input; + const IGCTensor *_biases; + IGCTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H__ */ \ No newline at end of file diff --git a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h index fa688dbfb6..bbd8218722 100644 --- a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h +++ b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h" #include "arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h" #include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h" #include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h" #include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h" #include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h" diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h new file mode 100644 index 0000000000..e3fa98e6e7 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_GCCONVOLUTIONLAYER_H__ +#define __ARM_COMPUTE_GCCONVOLUTIONLAYER_H__ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/IFunction.h" + +#include + +namespace arm_compute +{ +class IGCTensor; + +/** Function to reshape and transpose the weights. This function calls the following kernels: + * -# @ref GCWeightsReshapeKernel + * -# @ref GCGEMMTranspose1xWKernel + */ +class GCConvolutionLayerReshapeWeights : public IFunction +{ +public: + /** Constructor */ + GCConvolutionLayerReshapeWeights(); + /** Set the input and output tensors. + * + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: F16/F32. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[out] output Destination tensor. Data types supported: Same as @p weights. + * @param[in] transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise. + * Data types supported: Same as @p weights. + */ + void configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose1xW); + // Inherited methods overridden: + void run() override; + +private: + GCWeightsReshapeKernel _weights_reshape_kernel; + GCGEMMTranspose1xWKernel _weights_transposed_kernel; + GCTensor _weights_reshaped; + bool _transpose1xW; +}; + +/** Basic function to compute the convolution layer. This function calls the following GLES kernels: + * + * -# @ref GCWeightsReshapeKernel (executed only once for each configuration) + * -# @ref GCGEMMTranspose1xWKernel (executed only once for each configuration) + * -# @ref GCIm2ColKernel + * -# @ref GCGEMMInterleave4x4Kernel + * -# @ref GCCol2ImKernel + */ +class GCConvolutionLayer : public IFunction +{ +public: + /** Default constructor */ + GCConvolutionLayer(); + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with GCWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with GCGEMMTranspose1xWKernel. Data type supported: Same as @p input. + */ + void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo()); + + // Inherited methods overridden: + void run() override; + +private: + /** Configures the appropriate matrix multiply routine + * + * @param input Input tensor. Data types supported: F16/F32. + * @param weights Weights tensor. Data type supported: Same as @p input. + * @param output Output tensor. Data types supported: Same as @p input, + * @param is_interleaved_transposed Flag that signals if matrix is interleaved transposed + */ + void configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output, bool is_interleaved_transposed = true); + +private: + GCConvolutionLayerReshapeWeights _reshape_weights; + GCIm2ColKernel _input_im2col_kernel; + GCGEMMInterleave4x4Kernel _input_interleave_kernel; + GCGEMMMatrixMultiplyKernel _mm_kernel; + GCCol2ImKernel _output_col2im_kernel; + GCFillBorderKernel _fill_border; + + GCTensor _input_im2col_reshaped; + GCTensor _input_interleaved_reshaped; + GCTensor _weights_reshaped; + GCTensor _weights_transposed; + GCTensor _gemm_output; + GCTensor _tmp_output; + + bool _append_bias; + bool _is_fully_connected_convolution; + bool _are_weights_reshaped; +}; +} + +#endif /* __ARM_COMPUTE_GCCONVOLUTIONLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h index 1ae5837de0..3ba44f59cb 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h index 48fa7ed504..2c83b13f84 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,7 +40,7 @@ class GCGEMMInterleave4x4 : public IGCSimpleFunction public: /** Initialise the kernel's inputs, output * - * @param[in] input First input tensor. Data types supported: F32 + * @param[in] input First input tensor. Data types supported: F32, F16 * @param[out] output Output tensor. Data type supported: same as @p input */ void configure(const IGCTensor *input, IGCTensor *output); diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp index 4c84c674fc..0b9cd3f4ee 100644 --- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp +++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -190,7 +190,6 @@ void GCKernel::update_shader_params() const std::map GCKernelLibrary::_shader_program_map = { { "absdiff", "absdiff.cs" }, - { "col2im", "convolution_layer.cs" }, { "direct_convolution1x1", "direct_convolution1x1.cs" }, { "direct_convolution3x3", "direct_convolution3x3.cs" }, { "direct_convolution5x5", "direct_convolution5x5.cs" }, @@ -207,9 +206,11 @@ const std::map GCKernelLibrary::_shader_program_map = { "gemm_mm_interleaved_transposed", "gemm.cs" }, { "gemm_mm_floating_point", "gemm.cs" }, { "gemm_transpose1x4", "gemm.cs" }, + { "reshape_to_columns", "convolution_layer.cs" }, { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cs" }, { "im2col_generic", "convolution_layer.cs" }, { "im2col_reduced", "convolution_layer.cs" }, + { "col2im", "convolution_layer.cs" }, { "transpose", "transpose.cs" }, { "activation_layer", "activation_layer.cs" }, { "softmax_layer_max", "softmax_layer.cs" }, diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp index 6666c0f3ae..55b7f0da4a 100644 --- a/src/core/GLES_COMPUTE/IGCKernel.cpp +++ b/src/core/GLES_COMPUTE/IGCKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs old mode 100755 new mode 100644 index 4bfac282e2..2648db08b3 --- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs +++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,32 +30,163 @@ layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = precision mediump float; #endif // DATA_TYPE_FP16 +#ifdef RESHAPE_TO_COLUMNS + +/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32" + * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_attrs The attributes of the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_attrs The attributes of the biases tensor + * @param[in] width The width of the input tensor + * @param[in] height The height of the input tensor + * @param[in] depth The depth of the input tensor + * @param[in] total_filters Total number of filters. 
4th dimension of the weights matrix + */ + +SHADER_PARAMS_DECLARATION +{ + Tensor3DAttributes src_attrs; + ImageAttributes dst_attrs; +#ifdef HAS_BIAS + VectorAttributes biases_attrs; +#endif /* HAS_BIAS */ + uint width; + uint height; + uint depth; + uint total_filters; +}; + +#if defined(DATA_TYPE_FP16) + +TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); +TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly); +#ifdef HAS_BIAS +TENSOR_DECLARATION(3, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly); +#endif /* BIAS */ + +void main() +{ + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); +#ifdef HAS_BIAS + VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift); +#endif /* BIAS */ + + bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1)) + && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1))); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint( + gl_GlobalInvocationID.z) + * uint(width) * uint(height) * uint(dst_attrs.stride_y)))); + // Linearize convolution elements + if(is_last_thread) + { + for(uint i = 0u; i < uint(total_filters); i = i + 2u) + { + vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + vec2 s; + if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0) + { + s.x = s0.x; + } + else + { + s.x = s0.y; + } + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z)); + + vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0) + { + s.y = s1.x; + } + else + { + s.y = s1.y; + } + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z)); +#ifdef HAS_BIAS + vec2 b = LOAD_UNPACK2_CURRENT_ITEM_HALF(biases_ptr, biases_iter); + STORE_PACK2_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, (2u * biases_attrs.stride_x)); +#endif /* HAS_BIAS */ + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x)); + } + } + else + { + for(uint i = 0u; i < uint(total_filters); i = i + 2u) + { + vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + vec2 s; + if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0) + { + s.x = s0.x; + } + else + { + s.x = s0.y; + } + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z)); + + vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); + if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0) + { + s.y = s1.x; + } + else + { + s.y = s1.y; + } + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z)); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x)); + } + } +} + +#endif /* DATA_TYPE_FP16 */ +#endif // RESHAPE_TO_COLUMNS + #ifdef IM2COL_GENERIC + /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. 
* * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note PAD_LEFT/PAD_RIGHT/PAD_TOP/PAD_BOTTOM must be passed for padding info, e.g. "#define PAD_LEFT xxx" + * @note KERNEL_WIDTH/KERNEL_HEIGHT/KERNEL_DEPTH must be passed for kernel dimension, e.g. "#define KERNEL_WIDTH xxx" + * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx" + * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx" + * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx" * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 * @param[in] src_attrs The attributes of the source tensor * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr * @param[in] dst_attrs The attributes of the destination tensor - * @param[in] filter_depth The depth of the used filter * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ + SHADER_PARAMS_DECLARATION { Tensor3DAttributes src_attrs; ImageAttributes dst_attrs; - uint filter_depth; uint src_stride_w; uint dst_stride_w; }; #ifdef DATA_TYPE_FP32 + TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); + void main(void) { Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); @@ -63,64 +194,315 @@ void main(void) uint xc = gl_GlobalInvocationID.x; // x coordinate in the convolved tensor uint yc = gl_GlobalInvocationID.y; // y coordinate in the convolved tensor - uint ch = gl_GlobalInvocationID.z % filter_depth; // input feature map - uint batch = gl_GlobalInvocationID.z / filter_depth; // the batch + uint ch = gl_GlobalInvocationID.z % KERNEL_DEPTH; // input feature map + uint batch = gl_GlobalInvocationID.z / KERNEL_DEPTH; // the batch // Calculate input indeces - uint xi = xc * uint(STRIDE_X) - uint(PAD_X); - uint yi = yc * uint(STRIDE_Y) - uint(PAD_Y); - uint input_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (ch * src_attrs.stride_z) + (batch * src_stride_w)); + uint xi = xc * uint(STRIDE_X) - uint(PAD_LEFT); + uint yi = yc * uint(STRIDE_Y) - uint(PAD_TOP); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * src_attrs.stride_z) + (batch * src_stride_w)); // Calculate output indeces - uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT); - uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution - uint output_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, (yo * dst_attrs.stride_y) + (batch * dst_stride_w) + xo); + uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT); + uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * dst_attrs.stride_y) + (batch * dst_stride_w) + xo); + + uint src_pos = 0u; // Linearize convolution elements for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y) { - for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x) + for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x, TENSOR_OFFSET_ADVANCE(dst_iter, 1u)) { -#if PAD_X == 0 && PAD_Y == 0 - output_offset = input_offset + ((x * src_attrs.stride_x + y * src_attrs.stride_y) >> uint(2)); - 
STORE(dst_ptr, output_offset, LOAD(src_ptr, input_offset)); - -#else // PAD_X == 0 && PAD_Y == 0 +#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos)); +#else /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) { - STORE(dst_ptr, output_offset, 0.0f); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, 0.0f); } else { - output_offset = input_offset + (x * srcs_attrs.stride_x + y * src_attrs.stride_y) >> uint(2)); - STORE(dst_ptr, output_offset, LOAD(src_ptr, input_offset)); + src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos)); } -#endif // PAD_X == 0 && PAD_Y == 0 +#endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */ } } #ifdef HAS_BIAS if(ch == (uint(KERNEL_DEPTH) - 1)) { - STORE(dst_ptr, output_offset, 1.0f); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f); } -#endif // HAS_BIAS +#endif /* HAS_BIAS */ } #elif defined(DATA_TYPE_FP16) + TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly); +#ifdef KERNEL_1x1 + +void main(void) +{ + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); + + uint xc = gl_GlobalInvocationID.x; + uint yc = gl_GlobalInvocationID.y; + uint zc = gl_GlobalInvocationID.z; + uint ch = zc % uint(KERNEL_DEPTH); // input feature map + uint batch = zc / uint(KERNEL_DEPTH); // the batch + + // Calculate input indeces + uint xi = xc; + uint yi = yc; + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.step_z); + + // Calculate output indeces + uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x; + uint xo = ch * dst_element_count; + uint yo = xc + yc * uint(CONVOLVED_WIDTH); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo); + + bool x_start_even = ((xc % 2u) == 0u); + bool z_depth_even = ((uint(KERNEL_DEPTH) % 2u) == 0u); + uint input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y); + uint tmp_left = 0u; + uint tmp_right = 0u; + + if(ch % 2u != 0u) + { + return; + } + + if(z_depth_even || (!z_depth_even && (int(ch) < (KERNEL_DEPTH - 1)))) + { + tmp_left = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y + src_attrs.stride_z); + tmp_right = LOAD(src_ptr, input_pos); + if(x_start_even) + { + tmp_right = (tmp_left & 0xffffu) + (tmp_right << 16u); + } + else + { + tmp_right = (tmp_left >> 16u) + (tmp_right & 0xffff0000u); + } + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); + +#ifdef HAS_BIAS + if(ch == (uint(KERNEL_DEPTH) - 2u)) + { + mediump vec2 bias_vec = vec2(1.f, 0.f); + uint bias_u = packHalf2x16(bias_vec); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, bias_u); + } +#endif /* HAS_BIAS */ + } + else + { + tmp_left = LOAD(src_ptr, input_pos); + if(x_start_even) + { + tmp_right = (tmp_left & 0xffffu); + } + else + { + tmp_right = (tmp_left >> 16u); + } + +#ifdef HAS_BIAS + mediump vec2 bias_vec = vec2(0.f, 
1.f); + uint bias_u = packHalf2x16(bias_vec); + tmp_right += (bias_u & 0xffff0000u); +#endif /* HAS_BIAS */ + + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + } +} + +#else /* KERNEL_1x1 */ + void main(void) { + uint xc = gl_GlobalInvocationID.x; + uint yc = gl_GlobalInvocationID.y; + uint zc = gl_GlobalInvocationID.z; + uint ch = zc % uint(KERNEL_DEPTH); // input feature map + uint batch = zc / uint(KERNEL_DEPTH); // the batch + + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); + Tensor3DIterator src_iter_b = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); + + // Calculate input indeces + uint src_element_count = src_attrs.step_x / src_attrs.stride_x; + uint xi = (xc * uint(STRIDE_X)) / src_element_count; + uint yi = yc * uint(STRIDE_Y); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.stride_z); + + // Calculate output indeces + uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x; + uint xo = (ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT)) * dst_element_count; + uint yo = xc + yc * uint(CONVOLVED_WIDTH); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo); + + bool x_start_even = ((xc * uint(STRIDE_X)) % 2u == 0u); + bool z_start_even = ((ch % 2u) == 0u); + uint input_pos = 0u; + uint tmp = 0u; + uint tmp_left = 0u; + uint tmp_right = 0u; + + // Linearize convolution elements + for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y) + { + uint xstart = 0u; + uint xend = 0u; + + // even col, even row + if(x_start_even) + { + if(((y - yi + ch) % 2u) == 0u) + { + for(uint x = xi, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos)); + } + } + else + { + // 1st pair + if(!z_start_even && (y == yi)) + { + // cross 2d feature map + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + + (ch - 1u) * src_attrs.stride_z); + } + else + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, + (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y); + } + tmp_right = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + tmp_right = (tmp_right & 0xffffu) + (tmp_left << 16u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); + + // remaining + for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x - 1u) * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + tmp_right = LOAD(src_ptr, input_pos); + tmp_right = (tmp_left >> 16u) + (tmp_right << 16u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + } + } + } + else + { + if((((y - yi) % 2u) == 0u && !z_start_even) || (((y - yi) % 2u) != 0u && z_start_even)) + { + // 
1st pair + if(y == yi) + { + // cross 2d feature map + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w + + (ch - 1u) * src_attrs.stride_z); + } + else + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, + (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y); + } + + tmp_right = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + tmp_right = (tmp_right >> 16u) + (tmp_left & 0xffff0000u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); + + // remaining + for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos)); + } + } + else if((((y - yi) % 2u) == 0u && z_start_even) || (((y - yi) % 2u) != 0u && !z_start_even)) + { + // 1st pair + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y); + tmp_right = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + 1u) * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + tmp_right = (tmp_right >> 16u) + (tmp_left << 16u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x); + + // remaining + for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x)) + { + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + tmp_right = LOAD(src_ptr, input_pos); + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x + 1u) * src_attrs.step_x + y * src_attrs.stride_y); + tmp_left = LOAD(src_ptr, input_pos); + tmp_right = (tmp_right >> 16u) + (tmp_left << 16u); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right); + } + } + } + } + + // NOTE: must handle last element manually instead of in loops + // to avoid write conflict across 2d boundary + if(ch == uint(KERNEL_DEPTH) - 1u) + { + uint x = xi + (uint(KERNEL_WIDTH) / 2u); + uint y = yi + uint(KERNEL_HEIGHT) - 1u; + input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y); + tmp = LOAD(src_ptr, input_pos); + if(!x_start_even) + { + tmp = (tmp >> 16u) + (tmp << 16u); + } + +#ifdef HAS_BIAS + mediump vec2 bias_vec = vec2(1.f, 1.f); + uint bias_u = packHalf2x16(bias_vec); + if(z_start_even) + { + tmp = (tmp & 0xffffu) + (bias_u & 0xffff0000u); + } + else + { + tmp = (bias_u & 0xffffu); + } +#endif /* HAS_BIAS */ + + STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp); + } } -#else /* DATA_TYPE_FP32 */ +#endif /* KERNEL_1x1 */ +#else /* DATA_TYPE_FP32 */ #error Data type not supported #endif /* DATA_TYPE_FP32 */ #endif /* IM2COL_GENERIC */ #ifdef IM2COL_REDUCED + /** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation * * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" @@ -133,6 +515,7 @@ void main(void) * @param[in] width The width of the input tensor * @param[in] height The height of the input tensor */ + 
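 // Illustrative note (not part of the shader interface): with an input of e.g. 7x7x3, the low three
 // dimensions collapse into a single row of 7*7*3 = 147 values; when HAS_BIAS is defined a trailing
 // 1.0 is appended, giving 148 elements for that image.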
SHADER_PARAMS_DECLARATION { Tensor3DAttributes src_attrs; @@ -142,6 +525,7 @@ SHADER_PARAMS_DECLARATION }; #ifdef DATA_TYPE_FP32 + TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); @@ -181,6 +565,7 @@ TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict); #endif /* IM2COL_REDUCED_8X */ #if defined(IM2COL_REDUCED_GENERIC) + void main(void) { Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); @@ -207,20 +592,20 @@ void main(void) else { // special op - uint tmpleft = uint(0); - uint tmpright = uint(0); - tmpright = LOAD_CURRENT_ITEM(src_ptr, src_iter); //right half + uint tmp_left = uint(0); + uint tmp_right = uint(0); + tmp_right = LOAD_CURRENT_ITEM(src_ptr, src_iter); //right half if(pos.x == uint(0)) { - tmpleft = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); //left half - tmpright = (tmpleft & uint(0xffff)) + (tmpright << uint(16)); + tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); //left half + tmp_right = (tmp_left & uint(0xffff)) + (tmp_right << uint(16)); } else { - tmpleft = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z))); - tmpright = ((tmpleft >> uint(16)) + (tmpright << uint(16))); + tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z))); + tmp_right = ((tmp_left >> uint(16)) + (tmp_right << uint(16))); } - STORE(dst_ptr, tmp_out_offset, tmpright); + STORE(dst_ptr, tmp_out_offset, tmp_right); } } else @@ -243,6 +628,7 @@ void main(void) } #else /* IM2COL_REDUCED_GENERIC */ + void main(void) { Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift); @@ -263,46 +649,86 @@ void main(void) STORE(dst_ptr, tmp_out_offset, tmp); #endif /* IM2COL_REDUCED_8X */ } -#endif /* IM2COL_REDUCED_GENERIC */ -#else /* DATA_TYPE_FP32 */ + +#endif /* IM2COL_REDUCED_GENERIC */ +#else /* DATA_TYPE_FP32 */ #error Data type not supported #endif /* DATA_TYPE_FP32 */ #endif /* IM2COL_REDUCED */ -#ifdef COL2IM +#ifdef WIDTH_OUTPUT + /** This kernel performs a reshaping of the output of the convolution layer. * * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_attrs The attributes of the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_attrs The attributes of the destination tensor - * @param[in] width The width of output convolved dimensions + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_attrs The attributes of the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination tensor + * @param[in] dst_depth The length of the destination tensor in Z dimension + * @param[in] dst_strideZ The actual stride of the destination tensor in Z dimension */ + SHADER_PARAMS_DECLARATION { - ImageAttributes src_attrs; + Tensor3DAttributes src_attrs; Tensor3DAttributes dst_attrs; - uint width; + uint dst_depth; + uint dst_strideZ; }; #ifdef DATA_TYPE_FP32 + TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly); TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict); + void main(void) { - ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); - uvec2 pos = uvec2(gl_GlobalInvocationID.xy); - uint tmp_out_offset = TENSOR3D_OFFSET(dst_iter, pos.y % width, pos.y / width, pos.x); + uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * WIDTH_OUTPUT * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * (src_attrs.stride_z)); - STORE(dst_ptr, tmp_out_offset, LOAD_CURRENT_ITEM(src_ptr, src_iter)); + STORE_CURRENT_ITEM(dst_ptr, dst_iter, + LOAD_CURRENT_ITEM(src_ptr, src_iter)); } #elif defined(DATA_TYPE_FP16) +TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly); +TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict); + +void main(void) +{ + Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift); + Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift); + + uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); + + if((pos.z % dst_depth) % 2u == 0u) + { + uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ; + uint tmp1_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes); + uint tmp2_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y); + vec2 tmp1 = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset); + vec2 tmp2 = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset); + vec2 result = vec2(tmp1.x, tmp2.x); + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result); + } + else + { + uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ - 2u; + uint tmp1_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes); + uint tmp2_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y); + vec2 tmp1 = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset); + vec2 tmp2 = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset); + vec2 result = vec2(tmp1.y, tmp2.y); + STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result); + } +} + #else /* DATA_TYPE_FP32 */ #error Data type not supported #endif /* DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs index 4beb3ad5d2..c81bed7066 100644 --- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs +++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -383,6 +383,81 @@ void main(void) #elif defined(DATA_TYPE_FP16) +#ifdef GEMM_TRANSPOSE1xW +/** This OpenGL ES kernel computes the "vector" 1x8 transposition of input matrix + * + * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src_attrs The attributes of the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination matrix + */ +SHADER_PARAMS_DECLARATION +{ + ImageAttributes src_attrs; + ImageAttributes dst_attrs; +}; +TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly); +TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly); + +void main(void) +{ + /* Compute address for Matrix B - source */ + ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift); + + /* Compute address for Matrix B transposed - destination. X and Y are swapped */ + TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst_attrs.stride_y); + + STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter)); +} +#endif /* GEMM_TRANSPOSE1xW */ + +#ifdef GEMM_INTERLEAVE4x4 +/** This OpenGLES kernel reshapes the input matrix interleaving the values + * + * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src_attrs The attributes of the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr + * @param[in] dst_attrs The attributes of the destination matrix + */ +SHADER_PARAMS_DECLARATION +{ + ImageAttributes src_attrs; + ImageAttributes dst_attrs; +}; +TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly); +TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly); + +void main(void) +{ + /* Compute source and destination addresses */ + ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift); + + vec4 s0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter); + vec4 s1[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1)); + vec4 s2[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2)); + vec4 s3[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3)); + + vec4 s[2]; + s[0] = vec4(s0[0].x, s1[0].x, s2[0].x, s3[0].x); + s[1] = vec4(s0[0].y, s1[0].y, s2[0].y, s3[0].y); + STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s); + + s[0] = vec4(s0[0].z, s1[0].z, s2[0].z, s3[0].z); + s[1] = vec4(s0[0].w, s1[0].w, s2[0].w, s3[0].w); + STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 1u), s); + + s[0] = vec4(s0[1].x, s1[1].x, s2[1].x, s3[1].x); + s[1] = vec4(s0[1].y, s1[1].y, s2[1].y, s3[1].y); + STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 2u), s); + + s[0] = vec4(s0[1].z, s1[1].z, s2[1].z, s3[1].z); + s[1] = vec4(s0[1].w, s1[1].w, s2[1].w, s3[1].w); + STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 3u), s); +} +#endif /* GEMM_INTERLEAVE4x4 */ + #ifdef GEMM_MM_FLOATING_POINT /** This OpenGL ES kernel computes the matrix multiplication between matrix A(src0) and matrix B(src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x4 before running the matrix multiplication @@ -757,6 +832,119 @@ void main(void) 
} #endif /* ACCUM_PROCESS_8X */ #endif /* GEMM_ACCUMULATE_BIASES */ -#else /* DATA_TYPE_FP16 */ + +#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED +/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) + * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication + * + * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src0_attrs The attributes of the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_attrs The attributes of the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_attrs The attributes of the destination matrix + */ +SHADER_PARAMS_DECLARATION +{ + ImageAttributes src0_attrs; + ImageAttributes src1_attrs; + ImageAttributes dst_attrs; +}; +TENSOR_DECLARATION(1, src0Buffer, uvec2, src0_ptr, src0_shift, 3, readonly); +TENSOR_DECLARATION(2, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly); +TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly); + +void main() +{ + ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift); + ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift); + ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift); + + /* Compute address for matrix A and B */ + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y)); + TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(gl_GlobalInvocationID.x) * (src1_attrs.stride_y)); + /* Compute end row address for matrix B */ + int end_row_mtx_b = (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) + int(COLS_B); + + /* Reset accumulators */ + vec4 c00[2]; + vec4 c10[2]; + vec4 c20[2]; + vec4 c30[2]; + c00[0] = vec4(0.0f); + c00[1] = vec4(0.0f); + c10[0] = vec4(0.0f); + c10[1] = vec4(0.0f); + c20[0] = vec4(0.0f); + c20[1] = vec4(0.0f); + c30[0] = vec4(0.0f); + c30[1] = vec4(0.0f); + + // FIXME: loop unrolling really needed for GLES? 
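+    // Each iteration below consumes two packed loads: 8 FP16 values (two 4x1 columns) from the
+    // interleaved matrix A and 16 FP16 values (two 1x8 rows) from the transposed matrix B,
+    // accumulating into a 4x8 output block. Offsets are shifted right by one to convert byte
+    // offsets into 16-bit element counts before comparing against end_row_mtx_b.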
+ for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) <= (end_row_mtx_b - 16); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 16), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 32)) + { + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + vec4 a0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter); + vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter); + + c00[0] += vec4(a0.x) * b0[0]; + c00[1] += vec4(a0.x) * b0[1]; + c10[0] += vec4(a0.y) * b0[0]; + c10[1] += vec4(a0.y) * b0[1]; + c20[0] += vec4(a0.z) * b0[0]; + c20[1] += vec4(a0.z) * b0[1]; + c30[0] += vec4(a0.w) * b0[0]; + c30[1] += vec4(a0.w) * b0[1]; + + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + a0 = LOAD_UNPACK4_HALF(src0_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src0_iter, 8)); + b0 = LOAD_UNPACK8_HALF(src1_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src1_iter, 16)); + + c00[0] += vec4(a0.x) * b0[0]; + c00[1] += vec4(a0.x) * b0[1]; + c10[0] += vec4(a0.y) * b0[0]; + c10[1] += vec4(a0.y) * b0[1]; + c20[0] += vec4(a0.z) * b0[0]; + c20[1] += vec4(a0.z) * b0[1]; + c30[0] += vec4(a0.w) * b0[0]; + c30[1] += vec4(a0.w) * b0[1]; + } + + for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) < end_row_mtx_b; TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 8), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 16)) + { + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + vec4 a0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter); + vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter); + + c00[0] += vec4(a0.x) * b0[0]; + c00[1] += vec4(a0.x) * b0[1]; + c10[0] += vec4(a0.y) * b0[0]; + c10[1] += vec4(a0.y) * b0[1]; + c20[0] += vec4(a0.z) * b0[0]; + c20[1] += vec4(a0.z) * b0[1]; + c30[0] += vec4(a0.w) * b0[0]; + c30[1] += vec4(a0.w) * b0[1]; + } + + /* Multiply by the weight of matrix product */ + c00[0] = c00[0] * vec4(ALPHA); + c00[1] = c00[1] * vec4(ALPHA); + c10[0] = c10[0] * vec4(ALPHA); + c10[1] = c10[1] * vec4(ALPHA); + c20[0] = c20[0] * vec4(ALPHA); + c20[1] = c20[1] * vec4(ALPHA); + c30[0] = c30[0] * vec4(ALPHA); + c30[1] = c30[1] * vec4(ALPHA); + + /* Store 4x8 block */ + STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), c00); + STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), c10); + STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), c20); + STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), c30); +} +#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */ +#else /* DATA_TYPE_FP16 */ #error Data type not supported #endif /* DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h old mode 100755 new mode 100644 index dd9e1a3864..014ff4045e --- a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h +++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp index 492f708a98..af1e34ef59 100644 --- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,6 +24,7 @@ #include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h" +#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -43,31 +44,50 @@ GCCol2ImKernel::GCCol2ImKernel() void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output, std::pair convolved_dims) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + TensorShape output_shape = input->info()->tensor_shape(); + output_shape.set(0, convolved_dims.first); + output_shape.set(1, convolved_dims.second); + output_shape.set(2, input->info()->tensor_shape()[0]); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); _input = input; _output = output; _convolved_dims = convolved_dims; + unsigned int num_elems_processed_per_iteration = 1; + // Create kernel - std::set build_opts; - constexpr unsigned int num_elems_processed_per_iteration = 8; + std::set build_opts; + build_opts.emplace("#define WIDTH_OUTPUT " + support::cpp11::to_string(_convolved_dims.first)); + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); - build_opts.insert("#define COL2IM"); - _kernel = static_cast(GCKernelLibrary::get().create_kernel("col2im", build_opts)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(num_elems_processed_per_iteration)); - // Set static kernel arguments - unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor(); - _kernel.set_argument(idx++, _convolved_dims.first); + _kernel = static_cast(GCKernelLibrary::get().create_kernel("col2im", build_opts)); // Configure window - Window win = calculate_max_window(*input->info(), Steps()); + unsigned int nums = 2; + Window win = calculate_max_window(*output->info(), Steps(nums)); - // The GCCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + AccessWindowHorizontal output_access(output->info(), 0, 2); + const int input_padding = ceil_to_multiple(input->info()->dimension(0), 2) - input->info()->dimension(0); + + AccessWindowStatic input_access(input->info(), 0, 0, input->info()->dimension(0) + input_padding, input->info()->dimension(1) + 1); + + update_window_and_padding(win, input_access, + output_access); + + output_access.set_valid_region(win, output->info()->valid_region()); IGCKernel::configure(win); } @@ -77,20 +97,25 @@ void GCCol2ImKernel::run(const Window &window) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window); - Window slice_in = window.first_slice_window_2D(); 
- Window slice_out = window.first_slice_window_3D(); - _kernel.use(); + Window collapsed_window = window.collapse_if_possible(IGCKernel::window(), Window::DimZ); + Window slice = collapsed_window.first_slice_window_3D(); + + // Set static kernel arguments + unsigned int idx = 2 * num_arguments_per_3D_tensor(); + //_kernel.set_argument(idx++, _output->info()->strides_in_bytes()[3]); + _kernel.set_argument(idx++, uint(_output->info()->dimension(2))); + _kernel.set_argument(idx++, _input->info()->strides_in_bytes()[2]); + do { // Set inputs - unsigned int idx = 0; - unsigned int binding = 1; - add_2D_tensor_argument(idx, _input, binding++, slice_in); - add_3D_tensor_argument(idx, _output, binding++, slice_out); + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, 1, slice); + add_3D_tensor_argument(idx, _output, 2, slice); _kernel.update_shader_params(); - enqueue(*this, slice_in); + enqueue(*this, slice); } - while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out)); + while(collapsed_window.slide_window_slice_3D(slice)); } diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp index 4eceab8266..7b1848c32b 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp index 77423fd8bc..23f1c2eada 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp index cd06be2585..8886b84fc0 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp index ef47e7fd8c..dc86bfb2cc 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -44,7 +44,7 @@ GCGEMMInterleave4x4Kernel::GCGEMMInterleave4x4Kernel() void GCGEMMInterleave4x4Kernel::configure(const IGCTensor *input, IGCTensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); TensorShape output_shape = input->info()->tensor_shape(); diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp index 7ee39346f8..6d856e98c3 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp index a78446e074..5d9f9c2d3e 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,7 +40,7 @@ using namespace arm_compute; void GCGEMMTranspose1xWKernel::configure(const IGCTensor *input, IGCTensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); TensorShape output_shape{ input->info()->tensor_shape() }; diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp index e849891c7c..4ab6f3e89d 100644 --- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ + #include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h" #include "arm_compute/core/AccessWindowStatic.h" @@ -30,6 +31,7 @@ #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "support/ToolchainSupport.h" @@ -39,20 +41,40 @@ using namespace arm_compute; +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + + // Checks performed when output is configured + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } + + return Status{}; +} +} // namespace + GCIm2ColKernel::GCIm2ColKernel() - : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr) + : _input(nullptr), _output(nullptr), _convolved_dims(), _kernel_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr) { } -void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::pair kernel_dims, const PadStrideInfo &conv_info, bool has_bias) +void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_UNUSED(kernel_dims); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); _input = input; _output = output; + // Create kernel std::set build_opts; std::string dt_name = (input->info()->data_type() == DataType::F32) ? 
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); @@ -65,48 +87,52 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::p build_opts.emplace("#define HAS_BIAS"); } - int pad_x = 0; - int pad_y = 0; int stride_x = 0; int stride_y = 0; - std::tie(pad_x, pad_y) = conv_info.pad(); + std::tie(stride_x, stride_y) = conv_info.stride(); + _kernel_dims = std::make_pair(kernel_dims.width, kernel_dims.height); const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), output->info()->tensor_shape().cbegin() + 1)) - && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0)); + && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding()); + std::string kernel_name = "im2col_generic"; if(!run_img2col_reduced) { - // this path is currently not used and not validated - build_opts.insert("#define IM2COL_GENERIC"); + if(input->info()->data_type() == DataType::F16 && _kernel_dims == std::pair(1, 1)) + { + build_opts.emplace("#define KERNEL_1x1"); + } + + build_opts.emplace("#define IM2COL_GENERIC"); _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), - kernel_dims.first, kernel_dims.second, + kernel_dims.width, kernel_dims.height, conv_info); - _num_elems_processed_per_iteration = output->info()->dimension(0); + _num_elems_processed_per_iteration = 2; - build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.first)); - build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.second)); + build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.width)); + build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.height)); build_opts.emplace("#define KERNEL_DEPTH " + support::cpp11::to_string(input->info()->dimension(2))); build_opts.emplace("#define CONVOLVED_WIDTH " + support::cpp11::to_string(_convolved_dims.first)); build_opts.emplace("#define CONVOLVED_HEIGHT " + support::cpp11::to_string(_convolved_dims.second)); build_opts.emplace("#define STRIDE_X " + support::cpp11::to_string(conv_info.stride().first)); build_opts.emplace("#define STRIDE_Y " + support::cpp11::to_string(conv_info.stride().second)); - build_opts.emplace("#define PAD_X " + support::cpp11::to_string(conv_info.pad().first)); - build_opts.emplace("#define PAD_Y " + support::cpp11::to_string(conv_info.pad().second)); + build_opts.emplace("#define PAD_LEFT " + support::cpp11::to_string(conv_info.pad_left())); + build_opts.emplace("#define PAD_TOP " + support::cpp11::to_string(conv_info.pad_top())); + build_opts.emplace("#define PAD_RIGHT " + support::cpp11::to_string(conv_info.pad_right())); + build_opts.emplace("#define PAD_BOTTOM " + support::cpp11::to_string(conv_info.pad_bottom())); build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0))); build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1))); - // Create kernel - _kernel = static_cast(GCKernelLibrary::get().create_kernel("im2col_generic", build_opts)); - _run_func = &GCIm2ColKernel::run_generic; } else { - build_opts.insert("#define IM2COL_REDUCED"); + build_opts.emplace("#define IM2COL_REDUCED"); + kernel_name = 
"im2col_reduced"; if(input->info()->data_type() == DataType::F32) { @@ -117,42 +143,47 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::p int input_width = input->info()->dimension(0); int input_height = input->info()->dimension(1); - build_opts.insert("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height)); + build_opts.emplace("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height)); if(input_width % 8 == 0) { _num_elems_processed_per_iteration = 8; - build_opts.insert("#define IM2COL_REDUCED_8X"); + build_opts.emplace("#define IM2COL_REDUCED_8X"); } else if(input_width % 4 == 0) { _num_elems_processed_per_iteration = 4; - build_opts.insert("#define IM2COL_REDUCED_4X"); + build_opts.emplace("#define IM2COL_REDUCED_4X"); } else if(input_width % 2 == 0) { _num_elems_processed_per_iteration = 2; - build_opts.insert("#define IM2COL_REDUCED_2X"); + build_opts.emplace("#define IM2COL_REDUCED_2X"); } else { _num_elems_processed_per_iteration = 2; - build_opts.insert("#define IM2COL_REDUCED_GENERIC"); + build_opts.emplace("#define IM2COL_REDUCED_GENERIC"); } } - // Create kernel - _kernel = static_cast(GCKernelLibrary::get().create_kernel("im2col_reduced", build_opts)); - _run_func = &GCIm2ColKernel::run_reduced; } + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel(kernel_name, build_opts)); + // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration)); if(input->info()->data_type() == DataType::F16) { // Calculate input right and bottom border - AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration); + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + int input_total_width = input->info()->padding().left + input_width + input->info()->padding().right; + int input_padding_right = ceil_to_multiple(input_total_width, _num_elems_processed_per_iteration) - input_total_width; + input_total_width = input_width + input_padding_right + input->info()->padding().right; + AccessWindowStatic input_access(input->info(), 0, 0, input_total_width, input_height); // Calculate output right and bottom border const int output_width = output->info()->dimension(0); @@ -174,6 +205,15 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::p IGCKernel::configure(win); } +Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) +{ + ARM_COMPUTE_UNUSED(kernel_dims); + ARM_COMPUTE_UNUSED(conv_info); + ARM_COMPUTE_UNUSED(has_bias); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + return Status{}; +} + void GCIm2ColKernel::run(const Window &window) { ARM_COMPUTE_ERROR_ON(_run_func == nullptr); @@ -187,6 +227,7 @@ void GCIm2ColKernel::run_generic(const Window &window) // Get initial windows Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ); + // Change the Z dimension's step back to 1 window_collapsed.set_dimension_step(Window::DimZ, 1); @@ -198,17 +239,18 @@ void GCIm2ColKernel::run_generic(const Window &window) slice.set(Window::DimX, Window::Dimension(0, static_cast(_convolved_dims.first), 1)); slice.set(Window::DimY, Window::Dimension(0, static_cast(_convolved_dims.second), 1)); - // Setup input slice - // The first three dimensions of the input are increased by the inner loops - 
slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - // Setup output slice slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration)); slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1)); slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + // we need top/left pad to be included in valid region + if(_input->info()->data_type() == DataType::F16) + { + (dynamic_cast(_input->info()))->init(_input->info()->tensor_shape(), _input->info()->num_channels(), _input->info()->data_type(), _input->info()->strides_in_bytes(), 0, + _input->info()->total_size(), _input->info()->fixed_point_position()); + } + _kernel.use(); do @@ -216,8 +258,6 @@ void GCIm2ColKernel::run_generic(const Window &window) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, 1, slice_in); add_2D_tensor_argument(idx, _output, 2, slice_out); - - _kernel.set_argument(idx++, static_cast(_input->info()->dimension(2))); _kernel.set_argument(idx++, static_cast(_input->info()->strides_in_bytes()[3])); _kernel.set_argument(idx++, static_cast(_output->info()->strides_in_bytes()[3])); _kernel.update_shader_params(); diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp index b3a7a90931..bda08e4238 100644 --- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp new file mode 100644 index 0000000000..4c08873dcf --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+
+using namespace arm_compute;
+using namespace arm_compute::gles_compute;
+
+GCWeightsReshapeKernel::GCWeightsReshapeKernel()
+    : _input(nullptr), _biases(nullptr), _output(nullptr)
+{
+}
+
+void GCWeightsReshapeKernel::configure(const IGCTensor *input, const IGCTensor *biases, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Calculate output shape
+    TensorShape output_shape{ input->info()->tensor_shape() };
+    output_shape.collapse(3);
+    const size_t tmp_dim = output_shape[0];
+    output_shape.set(0, output_shape[1]);
+    output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->num_dimensions() != 2));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3]));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3] || biases->info()->dimension(1) != input->info()->tensor_shape()[4]));
+    }
+
+    _biases = biases;
+    _output = output;
+    _input  = input;
+
+    // Create build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace("#define " + dt_name);
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define RESHAPE_TO_COLUMNS");
+    if(biases != nullptr)
+    {
+        build_opts.emplace("#define HAS_BIAS");
+    }
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+    // Set static arguments
+    unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+    idx += (biases != nullptr) ?
num_arguments_per_1D_tensor() : 0; + _kernel.set_argument(idx++, _input->info()->dimension(0)); + _kernel.set_argument(idx++, _input->info()->dimension(1)); + _kernel.set_argument(idx++, _input->info()->dimension(2)); + _kernel.set_argument(idx++, _input->info()->dimension(3)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps()); + + // The GCWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + IGCKernel::configure(win); +} + +void GCWeightsReshapeKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window); + + Window out_window; + out_window.use_tensor_dimensions(_output->info()->tensor_shape()); + + Window in_slice = window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_2D(); + + Window biases_window; + Window biases_slice; + + if(_biases != nullptr) + { + biases_window.use_tensor_dimensions(_biases->info()->tensor_shape()); + biases_slice = biases_window.first_slice_window_1D(); + } + + _kernel.use(); + + do + { + // Set arguments + unsigned idx = 0; + add_3D_tensor_argument(idx, _input, 1, in_slice); + add_2D_tensor_argument(idx, _output, 2, out_slice); + if(_biases != nullptr) + { + add_1D_tensor_argument(idx, _biases, 3, biases_slice); + biases_window.slide_window_slice_1D(biases_slice); + } + + _kernel.update_shader_params(); + // Run kernel + enqueue(*this, in_slice); + } + while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp new file mode 100644 index 0000000000..5689722340 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" + +#include +#include +#include + +using namespace arm_compute; + +GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights() + : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false) +{ +} + +void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose1xW) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(weights->info()->data_type())); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); + } + + const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); + const unsigned bias_element = (append_biases) ? 1 : 0; + const IGCTensor *biases_to_use = (append_biases) ? biases : nullptr; + + _transpose1xW = transpose1xW; + + if(transpose1xW) + { + // Create tensor to store the reshaped weights + const unsigned int mat_weights_cols = weights->info()->dimension(3); + const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element; + TensorShape shape_wr(mat_weights_cols, mat_weights_rows); + const DataType dt = weights->info()->data_type(); + const int fixed_point_position = weights->info()->fixed_point_position(); + TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position); + + _weights_reshaped.allocator()->init(info_wr); + _weights_reshape_kernel.configure(weights, biases_to_use, &_weights_reshaped); + _weights_transposed_kernel.configure(&_weights_reshaped, output); + _weights_reshaped.allocator()->allocate(); + } + else + { + _weights_reshape_kernel.configure(weights, biases_to_use, output); + } +} + +void GCConvolutionLayerReshapeWeights::run() +{ + GCScheduler::get().dispatch(_weights_reshape_kernel); + if(_transpose1xW) + { + GCScheduler::get().dispatch(_weights_transposed_kernel); + } +} + +GCConvolutionLayer::GCConvolutionLayer() + : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _fill_border(), _input_im2col_reshaped(), _input_interleaved_reshaped(), + _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false) +{ +} + +void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output, bool is_interleaved_transposed) +{ + _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed); +} + +void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + 
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    const DataType dt = input->info()->data_type();
+
+    _append_bias          = (biases != nullptr);
+    _are_weights_reshaped = weights_info.are_reshaped();
+
+    const unsigned   bias_element  = (_append_bias) ? 1 : 0;
+    const IGCTensor *biases_to_use = (_append_bias) ? biases : nullptr;
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+
+    const unsigned int kernel_width  = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+    const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    // Check if it's a "fully connected" convolution
+    _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+    const bool run_interleaved      = (!_is_fully_connected_convolution);
+
+    unsigned int mat_weights_cols = weights->info()->dimension(3);
+    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
+
+    // Reshape weights if needed
+    if(_are_weights_reshaped)
+    {
+        if(_is_fully_connected_convolution)
+        {
+            mat_weights_cols = weights->info()->dimension(0);
+            mat_weights_rows = weights->info()->dimension(1);
+        }
+        else
+        {
+            mat_weights_cols                         = weights_info.num_kernels();
+            const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+            mat_weights_rows                         = quarter_reshaped_cols + bias_element;
+        }
+    }
+    else
+    {
+        if(_is_fully_connected_convolution)
+        {
+            // Create tensor to store the reshaped weights
+            int num_elems_read_per_iteration_x = 1;
+            if(dt == DataType::F16)
+            {
+                num_elems_read_per_iteration_x = 2;
+            }
+            TensorShape shape_wr((ceil_to_multiple(mat_weights_cols, num_elems_read_per_iteration_x)), mat_weights_rows);
+            _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wr));
+            _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false /* 1xW transpose */);
+        }
+        else
+        {
+            // Create tensor to store transposed weights
+            const float transpose_width = 16.0f / input->info()->element_size();
+            TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+            _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wt));
+            _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, true /* 1xW transpose */);
+        }
+        weights = &_weights_reshaped;
+    }
+
+    // Create tensor to store im2col reshaped inputs
+    const unsigned int mat_input_cols = mat_weights_rows;
+    const unsigned int mat_input_rows = conv_w * conv_h;
+    TensorShape        shape_im2col   =
input->info()->tensor_shape(); + shape_im2col.set(0, mat_input_cols); + shape_im2col.set(1, mat_input_rows); + shape_im2col.set(2, 1); + + // FIXME: input->clone() doesn't work with subtensors for grouped convolutions. + TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position()); + _input_im2col_reshaped.allocator()->init(im2col_reshaped_info); + + // Create tensor (interleave) to prepare input tensor for GEMM + if(run_interleaved) + { + TensorShape shape_interleaved = shape_im2col; + shape_interleaved.set(0, shape_interleaved.x() * 4); + shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f)); + + // FIXME: input->clone() doesn't work with subtensors for grouped convolutions. + TensorInfo interleaved_info(shape_interleaved, 1, dt, input->info()->fixed_point_position()); + _input_interleaved_reshaped.allocator()->init(interleaved_info); + } + + // Create GEMM output tensor + TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, mat_input_rows); + const DataType gemm_data_type = dt; + + // FIXME: input->clone() doesn't work with subtensors for grouped convolutions. + TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position()); + _gemm_output.allocator()->init(info_gemm); + + // Configure kernels + if(dt == DataType::F16) + { + BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); + input->info()->extend_padding(border_size); + _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for PAD of im2col fp16: consider it as border + } + _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias); + + // Configure matrix multiply + if(run_interleaved) + { + _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped); + configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output); + _input_interleaved_reshaped.allocator()->allocate(); + } + else + { + configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, false); + } + _input_im2col_reshaped.allocator()->allocate(); + + // Configure Col2Im + _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h)); + _gemm_output.allocator()->allocate(); + + ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one"); + + // Allocate intermediate tensor + if(!_are_weights_reshaped) + { + _weights_reshaped.allocator()->allocate(); + } +} + +void GCConvolutionLayer::run() +{ + // Run weights reshaping (Runs once for every configure) + if(!_are_weights_reshaped) + { + _are_weights_reshaped = true; + _reshape_weights.run(); + } + + // Run im2col + GCScheduler::get().dispatch(_fill_border); + GCScheduler::get().memory_barrier(); + GCScheduler::get().dispatch(_input_im2col_kernel); + + if(!_is_fully_connected_convolution) + { + GCScheduler::get().memory_barrier(); + // Run interleave4x4 + GCScheduler::get().dispatch(_input_interleave_kernel); + } + + GCScheduler::get().memory_barrier(); + // Runs matrix multiply on reshaped matrices + GCScheduler::get().dispatch(_mm_kernel); + + GCScheduler::get().memory_barrier(); + // Reshape output matrix + GCScheduler::get().dispatch(_output_col2im_kernel, false); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp 
b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp index 041622d255..9e4f0f6c95 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp +++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -61,7 +61,7 @@ void GCFullyConnectedLayer::configure_conv_fc(const IGCTensor *input, const IGCT _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt)); // Configure im2col kernel - _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false); + _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false); // Configure matrix multiply kernel _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false); diff --git a/tests/benchmark/GLES_COMPUTE/ConvolutionLayer.cpp b/tests/benchmark/GLES_COMPUTE/ConvolutionLayer.cpp new file mode 100644 index 0000000000..0d8edb757d --- /dev/null +++ b/tests/benchmark/GLES_COMPUTE/ConvolutionLayer.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
+#include "tests/GLES_COMPUTE/GCAccessor.h"
+#include "tests/benchmark/fixtures/ConvolutionLayerFixture.h"
+#include "tests/datasets/system_tests/alexnet/AlexNetConvolutionLayerDataset.h"
+#include "tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h"
+#include "tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ConvolutionLayerDataset.h"
+#include "tests/datasets/system_tests/lenet5/LeNet5ConvolutionLayerDataset.h"
+#include "tests/datasets/system_tests/squeezenet/SqueezeNetConvolutionLayerDataset.h"
+#include "tests/datasets/system_tests/vgg/vgg16/VGG16ConvolutionLayerDataset.h"
+#include "tests/datasets/system_tests/yolo/v2/YOLOV2ConvolutionLayerDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F16 });
+} // namespace
+
+using GCConvolutionLayerFixture = ConvolutionLayerFixture<GCTensor, GCConvolutionLayer, GCAccessor>;
+
+TEST_SUITE(GC)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(AlexNetConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::ALL,
+                                framework::dataset::combine(framework::dataset::combine(datasets::AlexNetConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", 1)));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(LeNet5ConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::ALL,
+                                framework::dataset::combine(framework::dataset::combine(datasets::LeNet5ConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", 1)));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV1ConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::ALL,
+                                framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV1ConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", 1)));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV4ConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::ALL,
+                                framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV4ConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", 1)));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(SqueezeNetConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::ALL,
+                                framework::dataset::combine(framework::dataset::combine(datasets::SqueezeNetConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", 1)));
+
+TEST_SUITE(NIGHTLY)
+REGISTER_FIXTURE_DATA_TEST_CASE(AlexNetConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::NIGHTLY,
+                                framework::dataset::combine(framework::dataset::combine(datasets::AlexNetConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 4, 8 })));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(LeNet5ConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::NIGHTLY,
+                                framework::dataset::combine(framework::dataset::combine(datasets::LeNet5ConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 4, 8 })));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV1ConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::NIGHTLY,
+
framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV1ConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 4, 8 })));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV4ConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::NIGHTLY,
+                                framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV4ConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 4, 8 })));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(SqueezeNetConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::NIGHTLY,
+                                framework::dataset::combine(framework::dataset::combine(datasets::SqueezeNetConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 4, 8 })));
+
+// 8 batches use about 1.8GB of memory which is too much for most devices!
+REGISTER_FIXTURE_DATA_TEST_CASE(VGG16ConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::NIGHTLY,
+                                framework::dataset::combine(framework::dataset::combine(datasets::VGG16ConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 1, 4 })));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(YOLOV2ConvolutionLayer, GCConvolutionLayerFixture, framework::DatasetMode::NIGHTLY,
+                                framework::dataset::combine(framework::dataset::combine(datasets::YOLOV2ConvolutionLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 1, 4, 8 })));
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h
index b494bf439a..191452c950 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
diff --git a/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
new file mode 100644
index 0000000000..a5d1b6992f
--- /dev/null
+++ b/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
+#include "tests/GLES_COMPUTE/GCAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/LargeConvolutionLayerDataset.h"
+#include "tests/datasets/SmallConvolutionLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ConvolutionLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float                     tolerance_num = 0.07f;                /**< Tolerance number */
+
+/** CNN data types */
+const auto CNNDataTypes = framework::dataset::make("DataType",
+{
+    DataType::F16,
+    // DataType::F32,
+});
+} // namespace
+
+TEST_SUITE(GC)
+TEST_SUITE(ConvolutionLayer)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()), CNNDataTypes),
+               input_shape, weights_shape, bias_shape, output_shape, info, data_type)
+{
+    // Set fixed point position data type allowed
+    int fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
+
+    auto bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+
+    // Create tensors
+    GCTensor src     = create_tensor<GCTensor>(input_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    GCTensor weights = create_tensor<GCTensor>(weights_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    GCTensor bias    = create_tensor<GCTensor>(bias_shape, bias_data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    GCTensor dst     = create_tensor<GCTensor>(output_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    const QuantizationInfo src_quantization_info     = src.info()->quantization_info();
+    const QuantizationInfo weights_quantization_info = weights.info()->quantization_info();
+
+    // Create and configure function
+    GCConvolutionLayer conv;
+    conv.configure(&src, &weights, &bias, &dst, info);
+
+    // Validate valid region
+    const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
+    const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape);
+    const ValidRegion bias_valid_region    = shape_to_valid_region(bias_shape);
+    const ValidRegion dst_valid_region     = shape_to_valid_region(output_shape);
+
+    validate(src.info()->valid_region(), src_valid_region);
+    validate(weights.info()->valid_region(), weights_valid_region);
+    validate(bias.info()->valid_region(), bias_valid_region);
+    validate(dst.info()->valid_region(), dst_valid_region);
+
+    // Validate QuantizationInfo
+    ARM_COMPUTE_EXPECT(src.info()->quantization_info() == src_quantization_info, framework::LogLevel::ERRORS);
+
ARM_COMPUTE_EXPECT(weights.info()->quantization_info() == weights_quantization_info, framework::LogLevel::ERRORS);
+
+    //Validate padding
+    //TODO(COMPMID-415) Need to validate padding?
+}
+
+template <typename T>
+using GCConvolutionLayerFixture = ConvolutionValidationFixture<GCTensor, GCAccessor, GCConvolutionLayer, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, GCConvolutionLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                                 framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                                         framework::dataset::make("DataType",
+                                                                                                                                                  DataType::F16)))
+{
+    // Validate output
+    validate(GCAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, GCConvolutionLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
+                                                                                                                               framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                                       framework::dataset::make("DataType",
+                                                                                                                                                DataType::F16)))
+{
+    // Validate output
+    validate(GCAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+}
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
--
cgit v1.2.1
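The new GCConvolutionLayer lowers the convolution onto a GEMM: GCIm2ColKernel unrolls the input, GCWeightsReshapeKernel flattens the weights (optionally followed by a 1xW transpose), the gemm.cs shader multiplies the two, and GCCol2ImKernel folds the product back into the output tensor. The shape arithmetic behind configure() can be summarised with a small host-side sketch (plain C++, assuming symmetric padding and the FLOOR rounding of the default PadStrideInfo; all names are illustrative):

struct GemmConvShapes
{
    unsigned int conv_w;           // convolved output width
    unsigned int conv_h;           // convolved output height
    unsigned int mat_weights_rows; // k_w * k_h * ifm, +1 when a bias row is appended
    unsigned int mat_weights_cols; // number of kernels (OFM)
    unsigned int mat_input_rows;   // conv_w * conv_h, one im2col row per output position
};

GemmConvShapes compute_gemm_conv_shapes(unsigned int in_w, unsigned int in_h, unsigned int ifm, unsigned int ofm,
                                        unsigned int k_w, unsigned int k_h,
                                        unsigned int stride_x, unsigned int stride_y,
                                        unsigned int pad_x, unsigned int pad_y, bool append_bias)
{
    GemmConvShapes s{};
    s.conv_w           = (in_w + 2 * pad_x - k_w) / stride_x + 1; // integer division gives FLOOR rounding
    s.conv_h           = (in_h + 2 * pad_y - k_h) / stride_y + 1;
    s.mat_weights_rows = k_w * k_h * ifm + (append_bias ? 1U : 0U);
    s.mat_weights_cols = ofm;
    s.mat_input_rows   = s.conv_w * s.conv_h;
    return s;
}

With these sizes, the im2col output is mat_input_rows x mat_weights_rows, the reshaped weights are mat_weights_cols x mat_weights_rows, and the GEMM result is mat_input_rows x mat_weights_cols.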
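GCIm2ColKernel writes one row per output position containing that position's receptive field across all input channels, with a trailing 1.0 when a bias is appended, so the dot product with the reshaped weights picks up the bias row. A naive reference of that lowering (a sketch: single batch, width-fastest [width, height, channel] storage, and an illustrative element order within each row):

#include <cstddef>
#include <vector>

std::vector<float> im2col_reference(const std::vector<float> &src, int src_w, int src_h, int src_c,
                                    int k_w, int k_h, int stride_x, int stride_y,
                                    int pad_left, int pad_top, int conv_w, int conv_h, bool has_bias)
{
    const int          row_len = k_w * k_h * src_c + (has_bias ? 1 : 0);
    std::vector<float> dst(static_cast<size_t>(conv_w) * conv_h * row_len, 0.f);

    for(int yo = 0; yo < conv_h; ++yo)
    {
        for(int xo = 0; xo < conv_w; ++xo)
        {
            float *row = &dst[(static_cast<size_t>(yo) * conv_w + xo) * row_len];
            int    idx = 0;
            for(int c = 0; c < src_c; ++c)
            {
                for(int ky = 0; ky < k_h; ++ky)
                {
                    for(int kx = 0; kx < k_w; ++kx)
                    {
                        const int  xi     = xo * stride_x - pad_left + kx;
                        const int  yi     = yo * stride_y - pad_top + ky;
                        const bool inside = (xi >= 0 && xi < src_w && yi >= 0 && yi < src_h);
                        // Out-of-image taps read as zero (the padding region)
                        row[idx++] = inside ? src[(static_cast<size_t>(c) * src_h + yi) * src_w + xi] : 0.f;
                    }
                }
            }
            if(has_bias)
            {
                row[idx] = 1.f; // multiplies the appended bias row of the reshaped weights
            }
        }
    }
    return dst;
}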
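GCWeightsReshapeKernel (the reshape_to_columns shader) turns the [k_w, k_h, IFM, OFM] weights into a 2D matrix with OFM along x and k_w * k_h * IFM along y, appending the bias values as one extra row when present; GCGEMMTranspose1xWKernel may then transpose it in 16-byte chunks (8 FP16 or 4 FP32 values) for the interleaved GEMM. A reference of the column layout (a sketch, width-fastest storage, illustrative index order within a kernel):

#include <cstddef>
#include <vector>

std::vector<float> reshape_to_columns_reference(const std::vector<float> &weights, const std::vector<float> *biases,
                                                int k_w, int k_h, int ifm, int ofm)
{
    const int          kernel_size = k_w * k_h * ifm;
    const int          rows        = kernel_size + (biases != nullptr ? 1 : 0);
    std::vector<float> out(static_cast<size_t>(rows) * ofm);

    for(int o = 0; o < ofm; ++o)
    {
        for(int r = 0; r < kernel_size; ++r)
        {
            // Row r walks the flattened [k_w, k_h, ifm] kernel of output channel o;
            // the result stores one kernel per column (x = o, y = r).
            out[static_cast<size_t>(r) * ofm + o] = weights[static_cast<size_t>(o) * kernel_size + r];
        }
        if(biases != nullptr)
        {
            out[static_cast<size_t>(kernel_size) * ofm + o] = (*biases)[o]; // bias as the last row
        }
    }
    return out;
}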
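In the FP16 GEMM_MM_INTERLEAVED_TRANSPOSED shader, matrix A arrives interleaved (groups of four values, one per output row, for consecutive k) and matrix B arrives transposed (groups of eight values, one per output column), packed four halves per uvec2 load and eight halves per uvec4 load, so each invocation accumulates one 4x8 output block and scales it by ALPHA. A scalar reference of that block product (a sketch, using float in place of half):

#include <array>
#include <cstddef>

// a: interleaved stripe of A, 4 values (one per output row) for each k step
// b: transposed stripe of B, 8 values (one per output column) for each k step
std::array<std::array<float, 8>, 4> gemm_block_4x8(const float *a, const float *b, size_t k, float alpha)
{
    std::array<std::array<float, 8>, 4> c{};
    for(size_t i = 0; i < k; ++i)
    {
        for(size_t row = 0; row < 4; ++row)
        {
            for(size_t col = 0; col < 8; ++col)
            {
                c[row][col] += a[4 * i + row] * b[8 * i + col];
            }
        }
    }
    // Multiply by the weight of the matrix product, as the shader does with ALPHA
    for(auto &row : c)
    {
        for(auto &v : row)
        {
            v *= alpha;
        }
    }
    return c;
}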
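GCCol2ImKernel reads the GEMM result, whose x dimension is the kernel count and whose y dimension is conv_w * conv_h, and scatters each row back to its spatial position in the [conv_w, conv_h, OFM] output. A reference of that rearrangement (a sketch, width-fastest storage):

#include <cstddef>
#include <vector>

std::vector<float> col2im_reference(const std::vector<float> &gemm_out, int conv_w, int conv_h, int num_kernels)
{
    std::vector<float> dst(static_cast<size_t>(conv_w) * conv_h * num_kernels);
    for(int ofm = 0; ofm < num_kernels; ++ofm)
    {
        for(int y = 0; y < conv_h; ++y)
        {
            for(int x = 0; x < conv_w; ++x)
            {
                const size_t spatial = static_cast<size_t>(y) * conv_w + x;
                // GEMM output element (x = ofm, y = spatial) becomes output element [x, y, ofm]
                dst[(static_cast<size_t>(ofm) * conv_h + y) * conv_w + x] = gemm_out[spatial * num_kernels + ofm];
            }
        }
    }
    return dst;
}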
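Putting the pieces together, a minimal usage sketch of the new function (illustrative shapes; it assumes a GLES compute context has already been created for the calling thread, and filling the tensors via map()/unmap() is omitted):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"

using namespace arm_compute;

void run_conv3x3_fp16()
{
    GCTensor src, weights, biases, dst;

    // 32x32 input with 16 channels, 3x3 kernels producing 8 output feature maps
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F16));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F16));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F16));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F16));

    GCConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src/weights/biases here ...

    conv.run();
}

For accuracy expectations, the validation suite above compares this path against the CPU reference with RelativeTolerance<half_float::half>(0.2) and a 7% allowed mismatch ratio (tolerance_num = 0.07f), reflecting FP16 accumulation error.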