From 358ca205c9e41f523517ffa55a9057308b736040 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 7 Dec 2017 16:47:52 +0000
Subject: COMPMID-617: Adds CLFullyConnectedLayer validation support

Change-Id: I4d2eb9872a3165fdcaa7784596e441cbe563dbc2
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112577
Tested-by: Jenkins
Reviewed-by: Ioan-Cristian Szabo
Reviewed-by: Anthony Barbier
---
 .../core/CL/kernels/CLGEMMInterleave4x4Kernel.h    |   8 +
 .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h    |  10 ++
 .../kernels/CLGEMMLowpOffsetContributionKernel.h   |  13 ++
 .../core/CL/kernels/CLGEMMLowpReductionKernel.h    |  16 ++
 .../kernels/CLGEMMMatrixAccumulateBiasesKernel.h   |   9 ++
 .../core/CL/kernels/CLGEMMMatrixMultiplyKernel.h   |  12 ++
 .../core/CL/kernels/CLGEMMTranspose1xWKernel.h     |   8 +
 arm_compute/core/CL/kernels/CLIm2ColKernel.h       |  15 +-
 arm_compute/core/utils/misc/ShapeCalculator.h      | 104 +++++++++++++
 .../runtime/CL/functions/CLFullyConnectedLayer.h   |  20 +++
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h    |  11 ++
 .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h  |  10 +-
 src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp  |  89 +++++++----
 .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp  | 135 +++++++++++------
 .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 134 +++++++++++------
 src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp  | 104 +++++++++----
 .../kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp |  65 +++++---
 src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 166 +++++++++++++--------
 src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp   |  97 ++++++++----
 src/core/CL/kernels/CLIm2ColKernel.cpp             |  34 ++++-
 .../kernels/NEGEMMLowpOffsetContributionKernel.cpp |   8 +-
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 129 +++++++++++++++-
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  | 104 ++++++++-----
 tests/validation/CL/FullyConnectedLayer.cpp        |  51 +++++++
 tests/validation/NEON/Col2Im.cpp                   |   4 +-
 tests/validation/NEON/Im2Col.cpp                   |   4 +-
 26 files changed, 1058 insertions(+), 302 deletions(-)
 create mode 100644 arm_compute/core/utils/misc/ShapeCalculator.h
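Review note (not part of the patch): every validate() entry point added below has the same shape, a static function that takes ITensorInfo descriptors and returns a Status, so a configuration can be rejected before any OpenCL resource is allocated. A hedged caller-side sketch of the headline addition, CLFullyConnectedLayer::validate(); the 128-in/64-out layer sizes are made up for illustration:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

    #include <iostream>

    using namespace arm_compute;

    int main()
    {
        // Describe a hypothetical 128-input / 64-output FC layer; nothing is allocated yet.
        const TensorInfo src(TensorShape(128U), 1, DataType::F32);
        const TensorInfo weights(TensorShape(128U, 64U), 1, DataType::F32);
        const TensorInfo bias(TensorShape(64U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(64U), 1, DataType::F32);

        // Returns a Status instead of asserting, so callers can fall back gracefully.
        const Status status = CLFullyConnectedLayer::validate(&src, &weights, &bias, &dst,
                                                              true /* transpose_weights */, false /* are_weights_reshaped */);
        if(status.error_code() != ErrorCode::OK)
        {
            std::cerr << status.error_description() << std::endl;
            return 1;
        }
        return 0;
    }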
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index c87fb2cd66..2520eff5de 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -68,6 +68,14 @@ public:
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
index b60b80618c..3ad3ced003 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -61,6 +61,16 @@ public:
      * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
      */
     void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyKernel
+     *
+     * @param[in] input0                    Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8
+     * @param[in] input1                    Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0
+     * @param[in] output                    Output tensor info to store the result of matrix multiplication. Data type supported: S32
+     * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed = true);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
index 5f2e025687..871b97c1d7 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -68,6 +68,19 @@ public:
      * @param[in] b_offset       Offset to be added to each element of the matrix B.
      */
     void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
+     *
+     * @param[in] mm_result      Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] a_offset       Offset to be added to each element of the matrix A.
+     * @param[in] b_offset       Offset to be added to each element of the matrix B.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
index aa0583fe81..12c12ef99a 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
@@ -71,6 +71,14 @@ public:
      * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      */
     void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
+     *
+     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8
+     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -90,6 +98,14 @@ public:
      * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      */
     void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
+     *
+     * @param[in] mtx_b          Input tensor. Data type supported: QASYMM8
+     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
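Review note: the reduction-kernel checks above run on bare ITensorInfo descriptors, so a caller can vet a GEMMLowp reduction before any OpenCL allocation happens. A minimal sketch under assumed sizes; the row-sum shape is derived with the new compute_reductionB_shape() helper introduced by this patch:

    #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    using namespace arm_compute;
    using namespace arm_compute::misc::shape_calculator;

    void check_row_sums()
    {
        // Matrix A of a QASYMM8 GEMMLowp, K = 32 columns and M = 64 rows (made-up sizes).
        const TensorInfo mtx_a(TensorShape(32U, 64U), 1, DataType::QASYMM8);

        // One S32 sum per row of A; the shape comes from the new helper.
        const TensorInfo vector_sum_row(compute_reductionB_shape(mtx_a), 1, DataType::S32);

        const Status status = CLGEMMLowpMatrixAReductionKernel::validate(&mtx_a, &vector_sum_row);
        ARM_COMPUTE_ERROR_THROW_ON(status); // or inspect status.error_description() instead
    }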
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
index 9348ff8ca8..2956f93cdc 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
@@ -50,6 +50,15 @@ public:
      * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
      */
     void configure(ICLTensor *accum, const ICLTensor *biases);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAccumulateBiasesKernel
+     *
+     * @param[in] accum      The accumulate tensor to convert. Data types supported: QS8/QS16/F16/F32
+     * @param[in] biases     The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p accum
+     * @param[in] gpu_target GPU target
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 5af9091416..4e73d7eb13 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -60,6 +60,18 @@ public:
      * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
      */
     void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
+     *
+     * @param[in] input0                    Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
+     * @param[in] input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[in] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in] alpha                     Weight of the matrix product
+     * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+     * @param[in] gpu_target                GPU Target
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index 8a37720462..8721643c1e 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -74,6 +74,14 @@ public:
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel
+     *
+     * @param[in] input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] output Output tensor. Data type supported: same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index 1d8b5500c1..88de1ba002 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -69,7 +69,7 @@ public:
     /** Set the input and output of the kernel.
      *
      * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                         while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+     *                         while every optional dimension from 4 and above represent a batch of inputs.
Data types supported: QS8/QASYMM8/QS16/F16/F32 * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, * while every dimension above represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -80,6 +80,19 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; + /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel + * + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, + * while every dimension above represents a batch. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); private: /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and kernel's low 3 dimensions are same as input) diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h new file mode 100644 index 0000000000..52773faa3a --- /dev/null +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H__
+#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H__
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace shape_calculator
+{
+inline TensorShape compute_interleaved_shape(const ITensorInfo &a)
+{
+    // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+    TensorShape shape_interleaved_a{ a.tensor_shape() };
+    shape_interleaved_a.set(0, a.dimension(0) * 4);
+    shape_interleaved_a.set(1, std::ceil(a.dimension(1) / 4.f));
+
+    return shape_interleaved_a;
+}
+inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
+{
+    // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+    TensorShape shape_transposed1xW_b{ b.tensor_shape() };
+    shape_transposed1xW_b.set(0, b.dimension(1) * 16);
+    shape_transposed1xW_b.set(1, std::ceil(b.dimension(0) / 16.f));
+
+    return shape_transposed1xW_b;
+}
+inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b)
+{
+    // The transpose1xW output matrix will have the following shape:
+    // [ b_height * (16 / element_size), ceil(b_width / (16.0f / element_size) ]
+    TensorShape  shape_transposed1xW_b{ b.tensor_shape() };
+    const size_t transpose_width = 16 / b.element_size();
+    shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
+    shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));
+
+    return shape_transposed1xW_b;
+}
+inline TensorShape compute_reductionA_shape(const ITensorInfo &b)
+{
+    TensorShape shape_vector_sum_col{ b.tensor_shape() };
+    if(shape_vector_sum_col.num_dimensions() > 1)
+    {
+        shape_vector_sum_col.remove_dimension(1);
+    }
+
+    return shape_vector_sum_col;
+}
+inline TensorShape compute_reductionB_shape(const ITensorInfo &a)
+{
+    TensorShape shape_vector_sum_row{ a.tensor_shape() };
+    shape_vector_sum_row.set(Window::DimX, a.dimension(1));
+    if(a.num_dimensions() > 1)
+    {
+        shape_vector_sum_row.remove_dimension(1);
+    }
+
+    return shape_vector_sum_row;
+}
+inline TensorShape compute_im2col_shape(const ITensorInfo &input)
+{
+    TensorShape shape_im2col{ input.tensor_shape() };
+    shape_im2col.collapse(3);
+
+    return shape_im2col;
+}
+inline TensorShape compute_transposed_shape(const ITensorInfo &input)
+{
+    TensorShape shape_transposed{ input.tensor_shape() };
+
+    shape_transposed.set(0, input.dimension(1));
+    shape_transposed.set(1, input.dimension(0));
+
+    return shape_transposed;
+}
+} // namespace shape_calculator
+} // namespace misc
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H__ */
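Review aside: a quick sanity check of the new helpers, using a made-up 31x17 F32 matrix; the expected values follow directly from the formulas in the header above:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    using namespace arm_compute;
    using namespace arm_compute::misc::shape_calculator;

    void shape_calculator_examples()
    {
        const TensorInfo a(TensorShape(31U, 17U), 1, DataType::F32);

        // [31, 17] -> [31 * 4, ceil(17 / 4)] = [124, 5]
        const TensorShape interleaved = compute_interleaved_shape(a);

        // F32 => element_size() == 4, so transpose_width == 16 / 4 == 4:
        // [31, 17] -> [17 * 4, ceil(31 / 4)] = [68, 8]
        const TensorShape transposed = compute_transpose1xW_with_element_size_shape(a);

        (void)interleaved;
        (void)transposed;
    }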
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 2cac06c1c9..1e9ee492ad 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -52,6 +52,14 @@ public:
      * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayerReshapeWeights
+     *
+     * @param[in] input  Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+     * @param[in] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 };
 
 /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
@@ -78,6 +86,18 @@ public:
      * @param[in] are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false.
      */
     void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayer
+     *
+     * @param[in] input                Source tensor. Data type supported: QS8/QASYMM8/QS16/F16/F32.
+     * @param[in] weights              Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
+     * @param[in] biases               Bias tensor. It can be nullptr. Data type supported: Same as @p input.
+     * @param[in] output               Destination tensor. Data type supported: Same as @p input.
+     * @param[in] transpose_weights    (Optional) Transpose weights if true. Defaults to true.
+     * @param[in] are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights = true, bool are_weights_reshaped = false);
 
     //Inherited methods override
     void run() override;

diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index e316144548..3976704907 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -69,6 +69,17 @@ public:
      *                       if the reshape of matrix B should be executed only for the first run
      */
     void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore
+     *
+     * @param[in] a         First input tensor  (Matrix A). Data type supported: QASYMM8.
+     * @param[in] b         Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[in] output    Output tensor. Data type supported: S32
+     * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                      if the reshape of matrix B should be executed only for the first run
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
 
     // Inherited methods overridden:
     void run() override;

diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 46e6b494f8..eddb3a26b7 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -75,11 +75,11 @@ public:
     void configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
      *
-     * @param[in]  a         First input tensor  (Matrix A). Data type supported: QASYMM8.
-     * @param[in]  b         Second input tensor (Matrix B). Data type supported: same as @p a
-     * @param[out] output    Output tensor. Data type supported: Data type supported: S32
-     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should be executed only for the first run
+     * @param[in] a         First input tensor  (Matrix A). Data type supported: QASYMM8.
+     * @param[in] b         Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[in] output    Output tensor. Data type supported: S32
+     * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                      if the reshape of matrix B should be executed only for the first run
      *
      * @return a status
      */
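Review note: the source-side changes below build the function-level checks by chaining kernel-level ones with ARM_COMPUTE_RETURN_ON_ERROR, which early-returns the first failing Status. An illustrative helper in that style, written for this review and not taken from the patch; validate_gemm_reshape_stage and its parameters are hypothetical names:

    #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
    #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
    #include "arm_compute/core/Error.h"

    using namespace arm_compute;

    // Hypothetical composite check for a GEMM reshape stage: every kernel in the
    // chain contributes its own static validate(), and the first error wins.
    Status validate_gemm_reshape_stage(const ITensorInfo *a, const ITensorInfo *tmp_a,
                                       const ITensorInfo *b, const ITensorInfo *tmp_b)
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, tmp_a));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, tmp_b));
        return Status{};
    }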
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 7741f12900..6886f54602 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -33,8 +33,55 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    unsigned int           num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type());
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+    bool                   window_changed                      = false;
+
+    // Configure kernel window
+    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
     : _input(nullptr), _output(nullptr)
@@ -43,22 +90,13 @@ CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
 
 void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
-                                                  DataType::U16, DataType::S16, DataType::QS16,
-                                                  DataType::U32, DataType::S32,
-                                                  DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, input->info()->dimension(0) * 4);
-    output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info())));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
@@ -68,20 +106,9 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
 
     // Configure kernel window
-    const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type());
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
 
     // Set config_id for enabling LWS tuning
     _config_id = "interleave4x4_";
@@ -92,6 +119,14 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
+Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+    return Status{};
+}
+
 void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
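Review note on the pattern above: validate() re-runs validate_and_configure_window() on clone()d copies of the infos, so the window/padding dry run cannot touch the caller's objects the way a real configure() (with its auto-initialization) would. A hedged usage sketch with a made-up shape:

    #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    using namespace arm_compute;
    using namespace arm_compute::misc::shape_calculator;

    bool can_interleave(const TensorInfo &src)
    {
        // Derive the expected destination and ask the kernel, without configuring it.
        const TensorInfo dst(compute_interleaved_shape(src), 1, src.data_type());

        // 'src' and 'dst' are left untouched: the dry run operated on clones.
        return bool(CLGEMMInterleave4x4Kernel::validate(&src, &dst));
    }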
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 1d9fe4bc01..423592b79c 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -46,79 +46,114 @@ namespace arm_compute
 class Coordinates;
 } // namespace arm_compute
 
-CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+namespace
 {
-}
+using ElementsProcessed = Steps;
 
-void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed)
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
     if(!is_interleaved_transposed)
     {
-        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
     }
 
-    _input0 = input0;
-    _input1 = input1;
-    _output = output;
+    return Status{};
+}
 
-    CLBuildOptions build_opts;
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, bool is_interleaved_transposed,
+                                                        ElementsProcessed &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
 
+    Window win{};
+    bool   window_changed = false;
+
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
     if(is_interleaved_transposed)
     {
-        // Create kernel and set static arguments
-        build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm_interleaved_transposed", build_opts.options()));
-
         // Configure window
-        constexpr unsigned int num_elems_processed_per_iteration_x = 16;
-        constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+        num_elems_processed_per_iteration_x = 16;
+        num_elems_processed_per_iteration_y = 4;
         constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
         constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
 
-        Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
-        AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-        update_window_and_padding(win, input0_access, input1_access, output_access);
+        AccessWindowRectangle input0_access(input0, 0, 0, num_elems_read_per_iteration_input0, 1);
+        AccessWindowRectangle input1_access(input1, 0, 0, num_elems_read_per_iteration_input1, 1);
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
-        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
 
-        ICLKernel::configure(win);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
     }
     else
     {
         // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor.
-        constexpr unsigned int num_elems_processed_per_iteration_x = 16;
-        const unsigned int     num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
-
-        build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
-        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
-        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
-
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm", build_opts.options()));
+        num_elems_processed_per_iteration_x = 16;
+        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
 
         // Configure window
-        Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-        AccessWindowStatic    input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
-        AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+        AccessWindowStatic    input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+        AccessWindowStatic    input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
-        update_window_and_padding(win, input0_access, input1_access, output_access);
+        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
 
         Coordinates coord;
-        coord.set_num_dimensions(output->info()->num_dimensions());
-        output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+        coord.set_num_dimensions(output->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
 
-        ICLKernel::configure(win);
+    // Create build options
+    CLBuildOptions build_opts;
+    std::string    kernel_name(" ");
+    if(is_interleaved_transposed)
+    {
+        build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+        kernel_name = "gemmlowp_mm_interleaved_transposed";
     }
+    else
+    {
+        build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
+        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
+        kernel_name = "gemmlowp_mm";
+    }
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set config_id for enabling LWS tuning
     _config_id = "gemmlowp_";
@@ -132,6 +167,20 @@ void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const IC
     _config_id += (is_interleaved_transposed ?
support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } +Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), + input1->clone().get(), + output->clone().get(), + is_interleaved_transposed, + num_elements_processed) + .first); + + return Status{}; +} + void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp index 2877a74be8..d05939fcf5 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp @@ -44,89 +44,135 @@ namespace arm_compute class Coordinates; } // namespace arm_compute -CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr) +namespace { -} - -void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); - - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr if(b_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1)); - // Validate batches - TensorShape output_shape = mm_result->info()->tensor_shape(); + TensorShape output_shape = mm_result->tensor_shape(); if(output_shape.num_dimensions() > 1) { - TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); vector_sum_row_shape.collapse_from(1); output_shape.collapse_from(2); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+                                        "mm_result tensor must have the same number of batches as the output tensor");
 
         if(a_offset != 0)
         {
-            TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
+            TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
             vector_sum_col_shape.collapse_from(1);
 
-            ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
-                                     && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                     "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                            "vector_sum_col tensor must have the same number of batches as vector_sum_row or the number of batches must be set to 1");
         }
     }
-
-        build_opts.add_option("-DB_OFFSET=" + support::cpp11::to_string(b_offset));
     }
 
-    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
-
-    _vector_sum_col = vector_sum_col;
-    _vector_sum_row = vector_sum_row;
-    _mm_result      = mm_result;
+    return Status{};
+}
 
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+                                                        int32_t a_offset, int32_t b_offset)
+{
     constexpr unsigned int num_elems_processed_per_iteration = 16;
+    bool                   window_changed                    = false;
 
     // Configure kernel window
-    Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
 
-    AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, mm_result_access);
+    AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win,
+                                                                 mm_result_access);
 
     if(a_offset != 0)
     {
-        AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win, vector_sum_col_access);
+        AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win,
+                                                                     vector_sum_col_access);
     }
-
     if(b_offset != 0)
     {
-        AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
-        update_window_and_padding(win, vector_sum_row_access);
+        AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+        window_changed = window_changed || update_window_and_padding(win,
+                                                                     vector_sum_row_access);
     }
 
-    ICLKernel::configure(win);
+    Status err = (window_changed) ?
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr) +{ +} + +void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + a_offset, b_offset)); // NOLINT + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(mm_result->info(), + vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, + a_offset, b_offset); // NOLINT + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); +} + +Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? 
vector_sum_row->clone().get() : nullptr, + a_offset, b_offset) + .first); // NOLINT + + return Status{}; } void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp index bcf04b0982..6951512167 100644 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp @@ -44,6 +44,59 @@ namespace arm_compute class Coordinates; } // namespace arm_compute +namespace +{ +Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + return Status{}; +} +std::pair validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output) +{ + const unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1)); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + +Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + return Status{}; +} + +std::pair validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output) +{ + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1)); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() : _input(), _output() { @@ -51,8 +104,9 @@ ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); _input = mtx_a; _output = vector_sum_row; @@ -64,21 +118,18 @@ void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTens // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options())); - const unsigned int num_elems_processed_per_iteration = 1; - // Configure kernel window - Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get()).first); - ICLKernel::configure(win); + return Status{}; } void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue) @@ -107,8 +158,8 @@ void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueu void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); _input = mtx_b; _output = vector_sum_col; @@ -121,21 +172,18 @@ void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTens // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_b_reduction", build_opts.options())); - constexpr unsigned int num_elems_processed_per_iteration = 16; - // Configure kernel window - Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), num_elems_processed_per_iteration), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, 
num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first); - ICLKernel::configure(win); + return Status{}; } void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp index 015b4f70a4..d5c93dd24a 100644 --- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -36,6 +36,37 @@ using namespace arm_compute; +namespace +{ +Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1); + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target, + unsigned int &num_elems_processed_per_iteration) +{ + // Select the vector size to use (8 for Bifrost; 16 for Midgard). + num_elems_processed_per_iteration = (gpu_target == GPUTarget::BIFROST) ? 8 : 16; + + // Configure kernel window + Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic biases_access(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->dimension(1)); + AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, biases_access, accum_access); + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() : _accum(nullptr), _biases(nullptr) { @@ -43,18 +74,21 @@ CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum); - ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); _biases = biases; _accum = accum; // Get the target architecture - GPUTarget arch_target = get_arch_from_target(get_target()); - // Select the vector size to use (8 for Bifrost; 16 for Midgard). - const unsigned int vector_size = (arch_target == GPUTarget::BIFROST) ? 8 : 16; + GPUTarget arch_target = get_arch_from_target(get_target()); + unsigned int vector_size = 0; + + // Configure kernel window + auto win_config = validate_and_configure_window(accum->info(), biases->info(), arch_target, vector_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); // Add build options CLBuildOptions build_opts; @@ -65,18 +99,15 @@ void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTe // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts.options())); +} - // Configure kernel window - const unsigned int num_elems_processed_per_iteration = vector_size; - - Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1)); - AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, biases_access, accum_access); +Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target) +{ + unsigned int num_elems_processed_per_iteration = 0; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get(), gpu_target, num_elems_processed_per_iteration).first); - ICLKernel::configure(win); + return Status{}; } void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 16706dd748..f51d0f92d4 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -42,6 +42,81 @@ using namespace arm_compute; +namespace +{ +using ElementsProcessed = Steps; + +inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + 
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 16706dd748..f51d0f92d4 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -42,6 +42,81 @@
 
 using namespace arm_compute;
 
+namespace
+{
+using ElementsProcessed = Steps;
+
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+    if(!is_interleaved_transposed)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+    }
+
+    return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
+                                                               bool is_interleaved_transposed, GPUTarget gpu_target,
+                                                               ElementsProcessed &num_elements_processed)
+{
+    bool   window_changed = false;
+    Window win{};
+
+    const DataType data_type                           = input0->data_type();
+    unsigned int  &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int  &num_elems_processed_per_iteration_y = num_elements_processed[1];
+
+    if(is_interleaved_transposed)
+    {
+        // Configure kernel window
+        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
+        num_elems_processed_per_iteration_y = 4;
+
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+    }
+    else // The input tensors have not been reshaped
+    {
+        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
+        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
+        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+
+        // Create kernels according to the architecture, data type and input size.
+        if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
+        {
+            num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000) ? 2 : 4;
+        }
+
+        // Configure window
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowStatic    input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+        AccessWindowStatic    input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
     : _input0(nullptr), _input1(nullptr), _output(nullptr)
 {
@@ -49,13 +124,10 @@ CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
 
 void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-    if(!is_interleaved_transposed)
-    {
-        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
-    }
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
 
     _input0 = input0;
     _input1 = input1;
@@ -82,14 +154,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
         _lws_hint = cl::NDRange(8, 8);
     }
 
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos));
 
-    const bool multiply_alpha = std::abs(1.0f - alpha) > 0.00001f;
-
     // Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.
-    if(multiply_alpha)
+    if(std::abs(1.0f - alpha) > 0.00001f)
     {
         build_opts.add_option_if_else(is_data_type_fixed_point(data_type),
                                       "-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))),
@@ -108,49 +185,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
         {
             kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
         }
-
-        // Configure kernel window
-        const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
-        constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
-        Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
-        update_window_and_padding(win, input0_access, input1_access, output_access);
-
-        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
-        ICLKernel::configure(win);
     }
     else // The input tensors have not been reshaped
     {
         build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
 
-        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
-        unsigned int       num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
-        const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
-
         // Create kernels according to the architecture, data type and input size.
         if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
         {
             // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
             // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
             // FC6 and FC7 of AlexNet and VGG-16).
-            if(input1->info()->dimension(0) <= 1000)
-            {
-                // Each work-item processes 2 elements in the X dimension.
-                num_elems_processed_per_iteration_x = 2;
-                kernel_name                         = "gemm_mm_floating_point_f32_bifrost_1000";
-            }
-            else
-            {
-                // Each work-item processes 4 elements in the X dimension (as in the default case).
-                num_elems_processed_per_iteration_x = 4;
-                kernel_name                         = "gemm_mm_floating_point_f32_bifrost";
-            }
+            kernel_name = (input1->info()->dimension(0) <= 1000) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";
+
             // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
             // via exhaustive autotuning over a range of representative layer configurations.
             _lws_hint = cl::NDRange(4);
@@ -164,23 +211,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
             build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
             kernel_name = "gemm_mm_floating_point";
         }
-        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
-        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
-
-        // Configure window
-        Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-        AccessWindowStatic    input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
-        AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
-        update_window_and_padding(win, input0_access, input1_access, output_access);
-
-        Coordinates coord;
-        coord.set_num_dimensions(output->info()->num_dimensions());
-        output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
-
-        ICLKernel::configure(win);
+        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
+        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
     }
 
     // Create kernel
@@ -198,6 +230,22 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
     _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
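The static validate() added below mirrors configure() but takes the GPU target as an explicit argument, since no kernel object (and therefore no stored target) exists at validation time; alpha is accepted only for signature symmetry and is marked unused. A hedged usage sketch with made-up F32 shapes for the non-reshaped path (A is 24x16, B is 16x8, so dimension(0) of A matches dimension(1) of B as the argument check requires):

    TensorInfo a(TensorShape(16U, 24U), 1, DataType::F32); // 16 columns, 24 rows
    TensorInfo b(TensorShape(8U, 16U), 1, DataType::F32);  //  8 columns, 16 rows
    TensorInfo dst(TensorShape(8U, 24U), 1, DataType::F32);

    const Status status = CLGEMMMatrixMultiplyKernel::validate(&a, &b, &dst, 1.f,
                                                               false /* is_interleaved_transposed */,
                                                               GPUTarget::MIDGARD);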
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+                                                              input1->clone().get(),
+                                                              output->clone().get(),
+                                                              is_interleaved_transposed,
+                                                              gpu_target,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
 void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 35074f94cf..69a545b76b 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -33,36 +33,82 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <cmath>
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
-void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
-                                                  DataType::U16, DataType::S16, DataType::QS16,
-                                                  DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                           compute_transpose1xW_with_element_size_shape(*input));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
 
-    TensorShape  output_shape{ input->info()->tensor_shape() };
-    const size_t transpose_w = 16 / input->info()->element_size();
-    output_shape.set(0, input->info()->dimension(1) * transpose_w);
-    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+    return Status{};
+}
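As a worked instance of the shape check above: compute_transpose1xW_with_element_size_shape() packs 16 / element_size elements per block, exactly as the removed inline computation did. For an F32 input (element size 4) of shape [W, H] = [8, 3], the expected output shape is [H * 4, ceil(W / 4)] = [12, 2]. A small sketch of the new validate() with these illustrative shapes:

    TensorInfo src(TensorShape(8U, 3U), 1, DataType::F32);  // W = 8, H = 3
    TensorInfo dst(TensorShape(12U, 2U), 1, DataType::F32); // 3 * (16 / 4) = 12, ceil(8 / 4.f) = 2

    const Status status = CLGEMMTranspose1xWKernel::validate(&src, &dst);

Leaving dst empty instead would also pass the argument checks, since the shape comparison is skipped for an unconfigured output, matching the auto-initialization path in configure().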
 
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+{
+    num_elems_processed_per_iteration = 16 / input->element_size();
+
+    const int scale_x        = num_elems_processed_per_iteration;
+    bool      window_changed = false;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    if((win.x().end() / scale_x) == 0)
+    {
+        return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Transposed shape would be 0 in the second dimension"), win);
+    }
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info())));
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-    const int          scale_x                           = num_elems_processed_per_iteration;
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
+    // Configure kernel window
+    unsigned int num_elems_processed_per_iteration = 1;
+    auto         win_config                        = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+
     /*
      * Following an example of how the transposition1xW works when the input data type is F32
      *
@@ -76,20 +122,15 @@ void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *outp
     // Create kernel
     std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration);
     _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+}
 
-    // Configure window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
-
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowTranspose  output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
+Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    unsigned int num_elems_processed_per_iteration = 1;
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
 
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 6514d6cf91..0e9f2c5344 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -39,6 +39,24 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
 CLIm2ColKernel::CLIm2ColKernel()
     : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
 {
@@ -46,9 +64,10 @@ CLIm2ColKernel::CLIm2ColKernel()
 
 void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
@@ -184,6 +203,15 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_UNUSED(kernel_dims);
+    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_UNUSED(has_bias);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    return Status{};
+}
+
 void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index f6964002dd..3d41548a6a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -71,16 +71,16 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
             vector_sum_row_shape.collapse_from(1);
             output_shape.collapse_from(2);
 
-            ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+                                            "mm_result tensor must have the same number of batches of output tensor");
 
             if(a_offset != 0)
             {
                 TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
                 vector_sum_col_shape.collapse_from(1);
 
-                ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
-                                         && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                         "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
             }
         }
     }
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 7fd81cdb94..68c6576a79 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
@@ -32,6 +33,34 @@
 #include <algorithm>
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, bool is_interleaved_transposed)
+{
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    if(is_data_type_quantized_asymmetric(input.data_type()))
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info(input.quantization_info().scale, -input.quantization_info().offset);
+        const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, -weights.quantization_info().offset);
+
+        // Validate gemmlowp function
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
+                                                                           &weights.clone()->set_quantization_info(weights_quantization_info),
+                                                                           &output));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, gpu_target));
+    }
+
+    return Status{};
+}
+} // namespace
 
 void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
 {
@@ -40,6 +69,11 @@ void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLT
     _kernel = std::move(k);
 }
 
+Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLTransposeKernel::validate(input, output);
+}
+
 CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
       _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
@@ -80,8 +114,7 @@ void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLT
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
-    TensorShape shape_im2col = input->info()->tensor_shape();
-    shape_im2col.collapse(3);
+    TensorShape shape_im2col = compute_im2col_shape(*input->info());
     _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
 
     // Configure im2col kernel
@@ -105,9 +138,15 @@ void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTen
 
 void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(),
+                                                               weights->info(),
+                                                               biases != nullptr ? biases->info() : nullptr,
+                                                               output->info(),
+                                                               transpose_weights,
+                                                               are_weights_reshaped));
 
     _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
     _is_fc_after_conv     = true;
@@ -192,6 +231,86 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
     }
 }
 
+Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+    bool            weights_reshaped = transpose_weights ? are_weights_reshaped : true;
+    bool            is_fc_after_conv = true;
+    bool            is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
+    const GPUTarget gpu_target       = CLScheduler::get().target();
+
+    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(*input)));
+    const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+    const ITensorInfo &gemmlowp_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+    // Configure accumulate biases kernel for non quantized asymmetric types
+    if(biases != nullptr && !is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
+    }
+
+    // With the Fully Connected layer we can have 4 different cases:
+    //  1) Convolution layer -> Fully Connected layer without batches
+    //  2) Fully Connected layer -> Fully Connected layer without batches
+    //  3) Convolution layer -> Fully Connected layer with batches
+    //  4) Fully Connected layer -> Fully Connected layer with batches
+
+    const ITensorInfo *input_to_use   = input;
+    const ITensorInfo *weights_to_use = weights;
+    const ITensorInfo *tmp_output     = (is_quantized) ? &gemmlowp_output : output;
+
+    if(!weights_reshaped)
+    {
+        // Validate reshape weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+        weights_to_use = &reshaped_weights;
+    }
+
+    // Check if we have a fully connected layer with batches
+    const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+    if(is_batched_fc_layer)
+    {
+        is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
+                                                                                 input->tensor_shape().cend(),
+                                                                                 output->tensor_shape().cbegin() + 1));
+    }
+    else
+    {
+        is_fc_after_conv = input->num_dimensions() > 1;
+    }
+
+    if(is_fc_after_conv)
+    {
+        // Fully Connected layer after a Convolution Layer without batches
+        ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+        // Validate im2col kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false));
+        input_to_use = &im2col_input;
+    }
+    else
+    {
+        // Fully Connected layer after a Fully Connected Layer without batches
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+    }
+
+    // Validate matrix multiply kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output, false));
+
+    // Validate output stage for asymmetric quantized types
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
+    }
+
+    return Status{};
+}
+
 void CLFullyConnectedLayer::run()
 {
     // Reshape of the weights (happens only once)
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 5c6f5b4ed0..ddcab6a256 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -29,9 +29,11 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
@@ -41,14 +43,9 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo
 
 void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    ARM_COMPUTE_UNUSED(gemm_info);
+    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
 
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _a_offset                    = a->info()->quantization_info().offset;
@@ -65,18 +62,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
-        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-        TensorShape shape_tmp_a = a->info()->tensor_shape();
-        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
-        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-        TensorShape shape_tmp_b = b->info()->tensor_shape();
-        shape_tmp_b.set(0, b->info()->dimension(1) * 16);
-        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
-        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+        TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
+        TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
         _tmp_a.allocator()->init(info_a);
         _tmp_b.allocator()->init(info_b);
         _memory_group.manage(&_tmp_a);
@@ -95,13 +82,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
     if(_a_offset != 0)
     {
-        TensorShape shape_vector_sum_col = b->info()->tensor_shape();
-
-        if(shape_vector_sum_col.num_dimensions() > 1)
-        {
-            shape_vector_sum_col.remove_dimension(1);
-        }
-        TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+        TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
         _vector_sum_col.allocator()->init(info_vector_sum_col);
         _memory_group.manage(&_vector_sum_col);
 
@@ -112,13 +93,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
    {
-        TensorShape shape_vector_sum_row = a->info()->tensor_shape();
-        shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
-        if(a->info()->num_dimensions() > 1)
-        {
-            shape_vector_sum_row.remove_dimension(1);
-        }
-        TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+        TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
         _vector_sum_row.allocator()->init(info_vector_sum_row);
         _memory_group.manage(&_vector_sum_row);
 
@@ -147,6 +122,67 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     }
 }
 
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
+                                    "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
+                                    "The output matrix must have the same number of columns as the matrix B");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    int32_t a_offset                  = a->quantization_info().offset;
+    int32_t b_offset                  = b->quantization_info().offset;
+    bool    is_interleaved_transposed = a->dimension(1) > 16;
+
+    if(is_interleaved_transposed)
+    {
+        TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type());
+        TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type());
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+    }
+
+    TensorInfo info_vector_sum_col, info_vector_sum_row;
+
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if(a_offset != 0)
+    {
+        info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+        // Configure Matrix B reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if(b_offset != 0)
+    {
+        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+        // Configure matrix A reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
+    }
+
+    // Validate offset contribution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                             a_offset, b_offset));
+
+    return Status{};
+}
+
 void CLGEMMLowpMatrixMultiplyCore::run()
 {
     _memory_group.acquire();
diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp
index 0d8c8774b8..aba92f14d8 100644
--- a/tests/validation/CL/FullyConnectedLayer.cpp
+++ b/tests/validation/CL/FullyConnectedLayer.cpp
@@ -115,6 +115,57 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(frame
     ARM_COMPUTE_EXPECT(weights.info()->quantization_info() == weights_quantization_info, framework::LogLevel::ERRORS);
 }
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
+                                            TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::QS8, 2), // Mismatching fixed point position
+                                            TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Invalid weights dimensions
+                                            TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Wrongly reshaped weights
+                                            TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
+                                          }),
+    framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
+                                             TensorInfo(TensorShape(315U, 271U), 1, DataType::QS8, 3),
+                                             TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
+                                           })),
+    framework::dataset::make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
+                                          TensorInfo(TensorShape(271U), 1, DataType::QS8, 2),
+                                          TensorInfo(TensorShape(192U), 1, DataType::F32),
+                                          TensorInfo(TensorShape(192U), 1, DataType::F32),
+                                          TensorInfo(TensorShape(271U), 1, DataType::F32),
+                                          TensorInfo(TensorShape(271U), 1, DataType::F32),
+                                          TensorInfo(TensorShape(192U), 1, DataType::F32),
+                                        })),
+    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(271U, 3U), 1, DataType::QS8, 3),
+                                            TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
+                                          })),
+    framework::dataset::make("TransposeWeights",{ true, true, true, false, true, true, true })),
+    framework::dataset::make("ReshapedWeights",{ false, false, false, false, false, false, false })),
+    framework::dataset::make("Expected", { false, false, true, true, false, false, true })),
+    input_info, weights_info, bias_info, output_info, transpose_weights, reshaped_weights, expected)
+{
+    Status status = CLFullyConnectedLayer::validate(&input_info.clone()->set_is_resizable(false),
+                                                    &weights_info.clone()->set_is_resizable(false),
+                                                    &bias_info.clone()->set_is_resizable(false),
+                                                    &output_info.clone()->set_is_resizable(false),
+                                                    transpose_weights,
+                                                    reshaped_weights);
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 template <typename T>
 using CLFullyConnectedLayerFixture = FullyConnectedLayerValidationFixture<CLTensor, CLAccessor, CLFullyConnectedLayer, T, false>;
diff --git a/tests/validation/NEON/Col2Im.cpp b/tests/validation/NEON/Col2Im.cpp
index 9125dc2498..9f2415d628 100644
--- a/tests/validation/NEON/Col2Im.cpp
+++ b/tests/validation/NEON/Col2Im.cpp
@@ -58,8 +58,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, false, true })),
                input_info, output_info, convolved_width, convolved_height, expected)
 {
-    bool err = bool(NECol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height)));
-    ARM_COMPUTE_EXPECT(err == expected, framework::LogLevel::ERRORS);
+    bool status = bool(NECol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height)));
+    ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index 4faa7d7d66..f8e474b6c3 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -56,8 +56,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
               framework::dataset::make("Expected", { false, false, false, false, true })),
               input_info, output_info, has_bias, expected)
 {
-    bool err = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
-    ARM_COMPUTE_EXPECT(err == expected, framework::LogLevel::ERRORS);
+    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
+    ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
-- 
cgit v1.2.1
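For reference, the layer-level entry point exercised by the CL test above can also be called directly. A minimal sketch reusing the shapes of the third dataset row (the first configuration expected to be valid): an 8x4x6 feature map with 4 batches, flattened into a 192-input fully connected layer with 192 outputs:

    TensorInfo input(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32);
    TensorInfo weights(TensorShape(192U, 192U), 1, DataType::F32);
    TensorInfo bias(TensorShape(192U), 1, DataType::F32);
    TensorInfo output(TensorShape(192U, 4U), 1, DataType::F32);

    const Status status = CLFullyConnectedLayer::validate(&input, &weights, &bias, &output,
                                                          true /* transpose_weights */,
                                                          false /* are_weights_reshaped */);
    // bool(status) == true here, matching the "Expected" entry in the dataset.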