From 951b8a4c01de2810349b6f16cf9bbba7578484fa Mon Sep 17 00:00:00 2001 From: Vidhya Sudhan Loganathan Date: Mon, 4 Nov 2019 14:42:08 +0000 Subject: COMPMID-2309 : CLConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters Change-Id: I16f6758b768ede404a064db057302ded706e1e8a Signed-off-by: Vidhya Sudhan Loganathan Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2215 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins --- .../core/CL/kernels/CLDepthConvertLayerKernel.h | 9 ++-- ...CLGEMMLowpOffsetContributionOutputStageKernel.h | 63 +++++++++++++--------- .../core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h | 4 +- .../core/CL/kernels/CLWeightsReshapeKernel.h | 10 ++-- arm_compute/core/Types.h | 1 + arm_compute/core/utils/quantization/AsymmHelpers.h | 14 +++-- .../runtime/CL/functions/CLConvolutionLayer.h | 9 ++-- .../runtime/CL/functions/CLGEMMConvolutionLayer.h | 22 ++++---- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 52 +++++++++++------- 9 files changed, 112 insertions(+), 72 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h index 7475d8d41d..cce7b69a0e 100644 --- a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h @@ -41,6 +41,7 @@ public: * * Valid conversions Input -> Output : * + * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data) * - U8 -> S8, U16, S16, U32, S32, F16, F32 * - U16 -> U8, S8, S16, U32, S32, F16, F32 * - S16 -> U8, S8, U16, U32, S32, F16, F32 @@ -49,16 +50,16 @@ public: * - F16 -> U8, S8, U16, S16, U32, F32 * - F32 -> U8, S8, U16, S16, U32, F16 * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. - * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32. + * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8. */ void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayerKernel * - * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. - * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] input Source tensor info. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32. + * @param[in] output Destination tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8. 
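
As a usage note outside the patch itself: below is a minimal sketch of how the new QSYMM8_PER_CHANNEL -> QASYMM8 conversion documented above might be driven directly through the kernel. The shapes, scale values and the scheduling calls are illustrative assumptions, not taken from this change, and the caller keeps track of the quantization info as the ATTENTION note requires.

#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include <vector>

using namespace arm_compute;

void convert_per_channel_weights_sketch()
{
    CLScheduler::get().default_init();

    // Per-channel symmetric weights: one (made-up) scale per output channel.
    CLTensor weights_qsymm8;
    CLTensor weights_qasymm8;
    const TensorShape weights_shape(3U, 3U, 16U, 32U); // [kernel_x, kernel_y, IFM, OFM]
    weights_qsymm8.allocator()->init(TensorInfo(weights_shape, 1, DataType::QSYMM8_PER_CHANNEL,
                                                QuantizationInfo(std::vector<float>(32, 0.05f))));
    weights_qasymm8.allocator()->init(TensorInfo(weights_shape, 1, DataType::QASYMM8));

    // QSYMM8_PER_CHANNEL -> QASYMM8: the per-channel quantization info is not
    // propagated by the kernel, so the caller must keep it around.
    CLDepthConvertLayerKernel convert;
    convert.configure(&weights_qsymm8, &weights_qasymm8, ConvertPolicy::SATURATE, 0);

    weights_qsymm8.allocator()->allocate();
    weights_qasymm8.allocator()->allocate();

    CLScheduler::get().enqueue(convert);
}
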
* diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h index de06c88d5c..301c67331e 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -51,39 +51,47 @@ public: CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8 - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info + * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8. + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_stage GEMMLowp output stage info + * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32 + * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). 
+ * Supported data types: S32 */ void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset, - const GEMMLowpOutputStageInfo &output_stage); + const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 or QASYMM8 if output_stage != NONE - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8 - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info + * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 or QASYMM8 if output_stage != NONE + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: QASYMM8. + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_stage GEMMLowp output stage info + * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32 + * @param[in] output_shifts Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). 
+ * Supported data types: S32 * * @return a status */ static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage); + int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -94,6 +102,9 @@ private: const ICLTensor *_vector_sum_row; const ICLTensor *_bias; ICLTensor *_output; + const ICLTensor *_output_multipliers; + const ICLTensor *_output_shifts; + bool _is_quantized_per_channel; }; } // namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h index 26ab210b21..937f6a9b89 100644 --- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h @@ -48,7 +48,7 @@ public: CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. Data type supported: same as @p input * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary * information to reshape the input tensor. Only the following values are supported: @@ -61,7 +61,7 @@ public: void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: All * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary * information to reshape the input tensor. Only the following values are supported: diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h index bdc5792641..59740b9db9 100644 --- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h +++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -69,9 +69,9 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with - * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input + * dimensions [OFM, num_patches] if unshared. 
Data types supported: F16/F32, for quantized types this must be nullptr. * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. * @param[out] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise. * Data types supported: Same as @p input @@ -82,9 +82,9 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with - * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input + * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr. * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. * @param[in] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise. * Data types supported: Same as @p input diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 851292f1e1..38d78971ef 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1883,6 +1883,7 @@ struct GEMMLowpOutputStageInfo int gemmlowp_max_bound{ 0 }; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */ std::vector gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ std::vector gemmlowp_shifts{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */ + bool is_quantized_per_channel{ false }; /**< GEMMLowp quantized per-channel flag */ }; /** GEMM LHS (Left Hand Side) matrix information */ diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h index 6b6cb007e3..0bf6ff5c95 100644 --- a/arm_compute/core/utils/quantization/AsymmHelpers.h +++ b/arm_compute/core/utils/quantization/AsymmHelpers.h @@ -84,15 +84,21 @@ std::pair get_min_max_values_from_quantized_data_type(DataType data_ty * per-channel, multipliers and shifts will end up being the same for each * channel. * - * @param[in] input Input tensor. - * @param[in] weights Weights tensor. - * @param[in] output Output tensor. + * @param[in] input Input tensor info. + * @param[in] weights Weights tensor info. + * @param[in] output Output tensor info. + * @param[in] idx_ofms Dimension index to get OFMs from the weights tensor. * @param[out] output_multipliers_ptr Pointer to the buffer where to store per-channel multipliers. * @param[out] output_shifts_ptr Pointer to the buffer where to store per-channel shifts. 
* * @return min and max values for the quantized data type */ -void compute_quantized_multipliers_and_shifts(const ITensor *input, const ITensor *weights, const ITensor *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr); +void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + unsigned int idx_ofms, + int32_t *output_multipliers_ptr, + int32_t *output_shifts_ptr); } // namespace quantization } // namespace arm_compute #endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index 04ce1cf635..8dfb6c86c0 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -78,7 +78,8 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. @@ -98,7 +99,8 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. @@ -120,7 +122,8 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. 
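
To make the relationship between the per-channel fields of GEMMLowpOutputStageInfo and this helper concrete, here is a hypothetical wrapper built only from the declarations shown in this patch. The element type of gemmlowp_multipliers/gemmlowp_shifts is assumed to be int32_t (the template argument is not visible in this listing), and QUANTIZE_DOWN_FIXEDPOINT is just one plausible output stage type.

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

using namespace arm_compute;

GEMMLowpOutputStageInfo make_output_stage_info_sketch(const ITensorInfo *input,
                                                      const ITensorInfo *weights,
                                                      const ITensorInfo *output,
                                                      unsigned int       idx_ofms)
{
    GEMMLowpOutputStageInfo info;
    info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);

    // One multiplier/shift per output channel; for per-tensor quantization the
    // helper repeats the same value for every channel.
    const size_t num_filters = weights->dimension(idx_ofms);
    info.gemmlowp_multipliers.resize(num_filters);
    info.gemmlowp_shifts.resize(num_filters);

    quantization::compute_quantized_multipliers_and_shifts(input, weights, output, idx_ofms,
                                                           info.gemmlowp_multipliers.data(),
                                                           info.gemmlowp_shifts.data());
    return info;
}
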
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h index 017bf78938..3392f11b06 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h @@ -60,7 +60,7 @@ public: /** Set the input and output tensors. * * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/F16/F32. + * Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[out] output Destination tensor. Data types supported: Same as @p weights. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout @@ -69,7 +69,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayerReshapeWeights * * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/F16/F32. + * Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[in] output Destination tensor. Data types supported: Same as @p weights. * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout @@ -168,7 +168,8 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. @@ -187,7 +188,8 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. 
* Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. @@ -212,7 +214,7 @@ private: /** Configures the appropriate matrix multiply routine * * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[in, out] output Output tensor. Data types supported: Same as @p input, @@ -225,12 +227,12 @@ private: const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines * - * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data types supported: Same as @p input, - * except for input of QASYMM8 type where output should be of S32 type. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * @param[in] input Input tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * @param[in] output Output tensor info. Data types supported: Same as @p input, + * except for input of QASYMM8 type where output should be of S32 type. * @param[in] gemmlowp_output_stage GEMMLowp output stage info * @param[in] gemm_3d_depth Depth of GEMM 3D * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. 
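
For context, a rough end-to-end sketch of the combination the updated documentation describes (QASYMM8 input, QSYMM8_PER_CHANNEL weights, S32 biases). Shapes and quantization parameters are invented, and the trailing configure() arguments (weights info, dilation, activation, groups) are assumed to keep their defaults.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"

#include <vector>

using namespace arm_compute;

void per_channel_conv_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::QASYMM8,
                                     QuantizationInfo(0.02f, 128)));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::QSYMM8_PER_CHANNEL,
                                         QuantizationInfo(std::vector<float>(32, 0.05f))));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32)); // S32 biases for a QASYMM8 input
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U), 1, DataType::QASYMM8,
                                     QuantizationInfo(0.1f, 128)));

    // 3x3 convolution, stride 1, padding 1, so the spatial size is preserved.
    CLGEMMConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    // Buffers are left unfilled here; a real caller would map and fill them first.
    conv.run();
}
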
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h index 6aacbf6abd..b364653a36 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -24,6 +24,7 @@ #ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ #define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ +#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" @@ -49,6 +50,7 @@ class ICLTensor; * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0) * -# @ref CLGEMMLowpOffsetContributionKernel (if gemm_info.gemmlowp_output_stage == NONE) * -# @ref CLGEMMLowpOffsetContributionOutputStageKernel (if gemm_info.gemmlowp_output_stage != NONE) + * -# @ref CLDepthConvertLayerKernel * */ class CLGEMMLowpMatrixMultiplyCore : public IFunction @@ -84,10 +86,10 @@ public: void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore * - * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[in] output Output tensor. Data type supported: S32 or QASYMM8 if gemm_info.gemmlowp_output_stage != NONE + * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8. + * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32 + * @param[in] output Output tensor info. 
Data type supported: S32 or QASYMM8 if gemm_info.gemmlowp_output_stage != NONE * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run * @@ -100,7 +102,10 @@ public: void prepare() override; private: - MemoryGroup _memory_group; + MemoryGroup _memory_group; + + // Kernels used + CLDepthConvertLayerKernel _weights_to_qasymm8; CLGEMMLowpMatrixMultiplyKernel _mm_midgard_kernel; CLGEMMLowpMatrixMultiplyNativeKernel _mm_native_kernel; CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel; @@ -109,18 +114,29 @@ private: CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel; CLGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; - CLTensor _vector_sum_col; - CLTensor _vector_sum_row; - CLTensor _tmp_b; - CLTensor _mm_result_s32; - const ICLTensor *_original_b; - int32_t _a_offset; - int32_t _b_offset; - bool _is_gemm_reshaped; - bool _is_midgard; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool _fuse_output_stage; + + // Temporary tensors + CLTensor _qasymm8_weights; + CLTensor _vector_sum_col; + CLTensor _vector_sum_row; + CLTensor _tmp_b; + CLTensor _mm_result_s32; + CLTensor _gemm_output_stage_multipliers; + CLTensor _gemm_output_stage_shifts; + + // Tensor pointers + const ICLTensor *_matrix_a; + const ICLTensor *_original_b; + const ICLTensor *_output; + + int32_t _a_offset; + int32_t _b_offset; + bool _is_gemm_reshaped; + bool _is_midgard; + bool _reshape_b_only_on_first_run; + bool _is_prepared; + bool _fuse_output_stage; + bool _convert_to_qasymm8; }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */ \ No newline at end of file -- cgit v1.2.1
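
Finally, a small validate()-only sketch of the new weight-type combination at the CLConvolutionLayer level (the function named in the commit title). The defaulted trailing arguments of validate() are an assumption; only the tensor data types follow the documentation in this patch.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

#include <vector>

using namespace arm_compute;

bool per_channel_conv_supported_sketch()
{
    const TensorInfo input(TensorShape(32U, 32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.02f, 128));
    const TensorInfo weights(TensorShape(3U, 3U, 16U, 32U), 1, DataType::QSYMM8_PER_CHANNEL,
                             QuantizationInfo(std::vector<float>(32, 0.05f)));
    const TensorInfo biases(TensorShape(32U), 1, DataType::S32);
    const TensorInfo output(TensorShape(32U, 32U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128));

    const Status status = CLConvolutionLayer::validate(&input, &weights, &biases, &output,
                                                       PadStrideInfo(1, 1, 1, 1));
    return status.error_code() == ErrorCode::OK;
}
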