From 6e1791b1bfabc81f08d3117939f6eb5264ed4edf Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Mon, 2 Dec 2019 19:01:25 +0000 Subject: COMPMID-2764: Add support for QASYMM8_SIGNED in NEConvolutionLayer. Change-Id: I8fbbd2e399f48968337a60147098d04f27c2d1c0 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/2402 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- arm_compute/core/NEON/kernels/NECol2ImKernel.h | 6 +- ...NEGEMMLowpOffsetContributionOutputStageKernel.h | 7 +- .../core/NEON/kernels/NEWeightsReshapeKernel.h | 6 +- arm_compute/core/Utils.h | 83 ++++++++++ arm_compute/runtime/NEON/functions/NECol2Im.h | 6 +- .../runtime/NEON/functions/NEConvolutionLayer.h | 12 +- .../NEON/functions/NEGEMMConvolutionLayer.h | 54 +++--- src/core/NEON/kernels/NECol2ImKernel.cpp | 4 - ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 182 +++++++++++++-------- src/core/NEON/kernels/NEIm2ColKernel.cpp | 10 +- src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 4 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 70 ++++---- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 6 +- tests/validation/Helpers.cpp | 35 ++-- tests/validation/Helpers.h | 8 + tests/validation/NEON/ConvolutionLayer.cpp | 13 ++ .../validation/fixtures/ConvolutionLayerFixture.h | 11 +- tests/validation/reference/ActivationLayer.cpp | 11 ++ tests/validation/reference/Convolution3d.h | 10 +- tests/validation/reference/ConvolutionLayer.cpp | 6 +- 20 files changed, 372 insertions(+), 172 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h index f02858e7d9..9858d4fd56 100644 --- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h +++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -72,7 +72,7 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: Any * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. @@ -80,7 +80,7 @@ public: void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims); /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: Any * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. 
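The documented data types for NECol2ImKernel are relaxed to "Any" because col2im only re-indexes elements and never interprets their values, so the same kernel serves QASYMM8_SIGNED without change. A minimal scalar sketch of that re-indexing follows; it is illustrative only and assumes the GEMM result stores one spatial position per row and one output feature map per column, ignoring the batches, strides and padding the real kernel handles through its Window iterators.

#include <cstddef>
#include <vector>

// Sketch of col2im as a pure element copy: gemm_out is [conv_w*conv_h rows x ofm cols],
// the returned image is the [width, height, OFM] output, flattened channel-major.
template <typename T>
std::vector<T> col2im_sketch(const std::vector<T> &gemm_out, size_t conv_w, size_t conv_h, size_t ofm)
{
    std::vector<T> image(conv_w * conv_h * ofm);
    for(size_t o = 0; o < ofm; ++o)
    {
        for(size_t y = 0; y < conv_h; ++y)
        {
            for(size_t x = 0; x < conv_w; ++x)
            {
                const size_t spatial = y * conv_w + x; // row index in the GEMM result
                image[(o * conv_h + y) * conv_w + x] = gemm_out[spatial * ofm + o];
            }
        }
    }
    return image;
}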
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h index dadc5c221b..ac17b2efa5 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h @@ -37,13 +37,14 @@ class ITensor; * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), * and adds to it the offset contribution of matrix A and matrix B in-place. * - * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint. + * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8. + * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8. * - * For QuantizeDownInt32ToUint8Scale the final result is: + * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is: * * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift * - * For QuantizeDownInt32ToUint8ScaleByFixedPoint the final result is: + * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is: * * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift * diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h index 585c707bb6..d432b731c2 100644 --- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h +++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h @@ -75,7 +75,8 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/FP16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. + * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/FP16/F32 * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. @@ -85,7 +86,8 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. + * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32 * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. 
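The kernel documentation above gives the two requantization formulas in prose. Below is a scalar sketch of both output stages, parameterized on the 8-bit output type (uint8_t for QASYMM8, int8_t for QASYMM8_SIGNED). FixedPointMul is assumed here to be the gemmlowp-style saturating rounding doubling high multiply, and the real kernel additionally clamps to gemmlowp_min_bound/gemmlowp_max_bound when an activation is fused.

#include <algorithm>
#include <cstdint>
#include <limits>

// Assumed gemmlowp-style saturating rounding doubling high multiply.
inline int32_t fixed_point_mul(int32_t a, int32_t b)
{
    if(a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max(); // saturate the single overflowing case
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

// ((mm_result + result_offset) * result_mult_int) >> result_shift, clamped to the output type range.
template <typename T> // T = uint8_t for QASYMM8, int8_t for QASYMM8_SIGNED
T quantize_down_scale(int32_t mm_result, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
{
    int32_t v = ((mm_result + result_offset) * result_mult_int) >> result_shift;
    v = std::max<int32_t>(std::numeric_limits<T>::lowest(), std::min<int32_t>(std::numeric_limits<T>::max(), v));
    return static_cast<T>(v);
}

// (FixedPointMul(mm_result, multiplier) >> result_shift) + offset_after_shift, clamped to the output type range.
template <typename T>
T quantize_down_scale_by_fixedpoint(int32_t mm_result, int32_t multiplier, int32_t result_shift, int32_t offset_after_shift)
{
    int32_t v = fixed_point_mul(mm_result, multiplier) >> result_shift; // the library uses a rounding shift here
    v += offset_after_shift;
    v = std::max<int32_t>(std::numeric_limits<T>::lowest(), std::min<int32_t>(std::numeric_limits<T>::max(), v));
    return static_cast<T>(v);
}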
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 366d5dcc68..590e281bb0 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -549,6 +549,72 @@ inline DataType get_promoted_data_type(DataType dt) return DataType::UNKNOWN; } +/** Compute the mininum and maximum values a data type can take + * + * @param[in] dt Data type to get the min/max bounds of + * + * @return A tuple (min,max) with the minimum and maximum values respectively wrapped in PixelValue. + */ +inline std::tuple get_min_max(DataType dt) +{ + PixelValue min(0); + PixelValue max(0); + switch(dt) + { + case DataType::U8: + case DataType::QASYMM8: + { + min = PixelValue(std::numeric_limits::lowest()); + max = PixelValue(std::numeric_limits::max()); + break; + } + case DataType::S8: + case DataType::QSYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + { + min = PixelValue(std::numeric_limits::lowest()); + max = PixelValue(std::numeric_limits::max()); + break; + } + case DataType::U16: + case DataType::QASYMM16: + { + min = PixelValue(std::numeric_limits::lowest()); + max = PixelValue(std::numeric_limits::max()); + break; + } + case DataType::S16: + case DataType::QSYMM16: + { + min = PixelValue(std::numeric_limits::lowest()); + max = PixelValue(std::numeric_limits::max()); + break; + } + case DataType::U32: + { + min = PixelValue(std::numeric_limits::lowest()); + max = PixelValue(std::numeric_limits::max()); + break; + } + case DataType::S32: + { + min = PixelValue(std::numeric_limits::lowest()); + max = PixelValue(std::numeric_limits::max()); + break; + } + case DataType::F32: + { + min = PixelValue(std::numeric_limits::lowest()); + max = PixelValue(std::numeric_limits::max()); + break; + } + default: + ARM_COMPUTE_ERROR("Undefined data type!"); + } + return std::make_tuple(min, max); +} + /** Return true if the given format has horizontal subsampling. * * @param[in] format Format to determine subsampling. @@ -1054,6 +1120,23 @@ inline bool is_data_type_quantized_asymmetric(DataType dt) } } +/** Check if a given data type is of asymmetric quantized signed type + * + * @param[in] dt Input data type. + * + * @return True if data type is of asymmetric quantized signed type, else false. + */ +inline bool is_data_type_quantized_asymmetric_signed(DataType dt) +{ + switch(dt) + { + case DataType::QASYMM8_SIGNED: + return true; + default: + return false; + } +} + /** Check if a given data type is of symmetric quantized type * * @param[in] dt Input data type. diff --git a/arm_compute/runtime/NEON/functions/NECol2Im.h b/arm_compute/runtime/NEON/functions/NECol2Im.h index 64ce9944e2..613507cf6a 100644 --- a/arm_compute/runtime/NEON/functions/NECol2Im.h +++ b/arm_compute/runtime/NEON/functions/NECol2Im.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,7 +39,7 @@ class NECol2Im : public INESimpleFunctionNoBorder public: /** Configure the col2im NEON kernel * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: Any * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. 
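The new get_min_max() helper above is what lets NEGEMMConvolutionLayer derive default activation clamps per data type instead of hardcoding [0, 255]. Note that the patch text has lost its angle-bracketed template arguments: the return type is a tuple of two PixelValue objects and the numeric_limits calls are specialized per element type. A self-contained sketch of the two 8-bit branches, with PixelValue replaced by plain int32_t purely for readability (an assumption made here, not the library's signature):

#include <cstdint>
#include <limits>
#include <tuple>

enum class DT { QASYMM8, QASYMM8_SIGNED };

// Returns the (min, max) representable values for the 8-bit quantized types.
inline std::tuple<int32_t, int32_t> get_min_max_sketch(DT dt)
{
    switch(dt)
    {
        case DT::QASYMM8:
            return std::make_tuple(std::numeric_limits<uint8_t>::lowest(),  // 0
                                   std::numeric_limits<uint8_t>::max());    // 255
        case DT::QASYMM8_SIGNED:
            return std::make_tuple(std::numeric_limits<int8_t>::lowest(),   // -128
                                   std::numeric_limits<int8_t>::max());     // 127
    }
    return std::make_tuple(0, 0);
}

// Usage mirroring configure_mm(): pick the clamp range for the output data type.
// int32_t min_act = 0, max_act = 0;
// std::tie(min_act, max_act) = get_min_max_sketch(DT::QASYMM8_SIGNED); // -128 / 127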
@@ -47,7 +47,7 @@ public: void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims); /** Static function to check if given info will lead to a valid configuration of @ref NECol2Im * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: Any * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index 4310ab4b41..91fcef5971 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -80,10 +80,10 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/F16/F32. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -101,10 +101,10 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/F16/F32. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -125,7 +125,7 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/F16/F32. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] weights Weights tensor. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. @@ -149,5 +149,5 @@ private: std::shared_ptr _memory_manager; std::unique_ptr _function; /**< Function to run */ }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index 6452fc9249..c513afa790 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -63,16 +63,16 @@ public: NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = default; /** Set the input and output tensors. * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[out] output Destination tensor. Data types supported: Same as @p weights. */ void configure(const ITensor *weights, const ITensor *biases, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[in] output Destination tensor. Data types supported: Same as @p weights. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[in] output Destination tensor info. Data types supported: Same as @p weights. * * @return an error status */ @@ -135,8 +135,8 @@ private: * * -# @ref NEIm2ColKernel * -# @ref NEGEMM (if the data type is FP32 or FP16) - * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8) - * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8) + * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED) + * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED) * -# @ref NEArithmeticAdditionKernel (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout) * -# @ref NECol2ImKernel (if NCHW data layout) * @@ -158,10 +158,10 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -175,13 +175,13 @@ public: const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights @@ -202,24 +202,24 @@ public: private: /** Configures the appropriate matrix multiply routine * - * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. + * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. 
Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[out] output Output tensor. Data types supported: Same as @p input, - * except for input of QASYMM8 type where output should be of S32 type. + * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) */ void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), int gemm_3d_depth = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines * - * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. - * @param[in] output Output tensor. Data types supported: Same as @p input, - * except for input of QASYMM8 type where output should be of S32 type. + * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] output Output tensor info. Data types supported: Same as @p input, + * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false) @@ -230,8 +230,8 @@ private: int gemm_3d_depth = 1, bool skip_im2col = false); /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref NEGEMMLowpMatrixMultiplyCore * - * @param[in] input_info Input tensor info. Data types supported: QASYMM8/F16/F32. - * @param[in] weights_info Weights tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] input_info Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights_info Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. 
* @param[in] gemm_3d_depth Depth of GEMM 3D * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp index e3661eef30..cea8782354 100644 --- a/src/core/NEON/kernels/NECol2ImKernel.cpp +++ b/src/core/NEON/kernels/NECol2ImKernel.cpp @@ -43,10 +43,6 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, - DataType::F16, DataType::F32); // Validate configured output if(output->total_size() != 0) diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index a32f0bbdae..84187332f8 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -269,6 +269,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 return out_s8; } +template +struct VectorTyper +{ + using stype = T; + using vtype = typename wrapper::traits::neon_bitvector_t; +}; + inline Window get_win_vector_sum(const Window &window) { Window win_vector_sum(window); @@ -300,9 +307,10 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) return bias_it; } -template +template inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, + const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, + typename VT::vtype min_vec, typename VT::vtype max_vec, int32_t a_offset, int32_t b_offset, int32_t k_offset, int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, int window_step_x, int window_start_x, int window_end_x) @@ -346,11 +354,13 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { - vst1q_u8(out_it.ptr() + x, finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8)); + wrapper::vstore(reinterpret_cast(out_it.ptr() + x), + finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec)); } else { - vst1q_u8(out_it.ptr() + x, finalize_quantization_floating_point(in_s32, result_shift_s32, min_u8, max_u8)); + wrapper::vstore(reinterpret_cast(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec)); } } // Compute left-over elements @@ -370,7 +380,9 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su if(is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, static_cast(min_bound), static_cast(max_bound)); + *reinterpret_cast(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, + static_cast(min_bound), + static_cast(max_bound)); } else { @@ -380,9 +392,10 @@ inline void 
run_offset_contribution_output_stage_window(const int32_t *vector_su // Bound and store the result if(is_bounded_relu) { - in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); + in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); } - *(out_it.ptr() + x) = static_cast(std::max(0, std::min(255, in_value))); + *reinterpret_cast(out_it.ptr() + x) = static_cast(std::max(static_cast(std::numeric_limits::lowest()), + std::min(static_cast(std::numeric_limits::max()), in_value))); } } } @@ -463,12 +476,15 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect } } -template +template void run_offset_contribution_output_stage(const Window &window, const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, GEMMLowpOutputStageInfo output_stage) { + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + using Typer = VectorTyper; + const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; @@ -478,10 +494,10 @@ void run_offset_contribution_output_stage(const Window &window, const int32_t min_bound = output_stage.gemmlowp_min_bound; const int32_t max_bound = output_stage.gemmlowp_max_bound; - const int32x4_t result_offset_s32 = vdupq_n_s32(offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift); - const uint8x16_t min_u8 = vdupq_n_u8(static_cast(min_bound)); - const uint8x16_t max_u8 = vdupq_n_u8(static_cast(max_bound)); + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? 
shift : -shift); + const auto min_vec = wrapper::vdup_n(static_cast(min_bound), ExactTagType{}); + const auto max_vec = wrapper::vdup_n(static_cast(max_bound), ExactTagType{}); const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -517,11 +533,13 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), + mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -533,10 +551,11 @@ void run_offset_contribution_output_stage(const Window &window, const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } @@ -557,10 +576,12 @@ void run_offset_contribution_output_stage(const Window &window, const int batch_id = id.z() / depth_input; const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, bias_it, mm_result_it, out_it); } @@ -571,10 +592,11 @@ void run_offset_contribution_output_stage(const Window &window, const int batch_id = id.z() / depth_input; const auto vector_sum_row_ptr = 
reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_row_it, mm_result_it, out_it); } @@ -595,10 +617,12 @@ void run_offset_contribution_output_stage(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, bias_it, mm_result_it, out_it); } @@ -608,10 +632,11 @@ void run_offset_contribution_output_stage(const Window &window, { const int batch_id = id.z() / depth_input; const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, vector_sum_col_it, mm_result_it, out_it); } @@ -623,10 +648,11 @@ void run_offset_contribution_output_stage(const Window &window, Iterator bias_it = get_bias_it(collapsed_window, bias); execute_window_loop(collapsed_window, [&](const Coordinates &) { - run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, bias_it, mm_result_it, out_it); } @@ -634,10 +660,11 @@ void run_offset_contribution_output_stage(const Window &window, { execute_window_loop(collapsed_window, 
[&](const Coordinates &) { - run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x); + run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x); }, mm_result_it, out_it); } @@ -844,24 +871,36 @@ std::pair validate_and_configure_window(ITensorInfo *mm_result, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage) { - static std::map map_function = - { - { 0, &run_offset_contribution_output_stage }, - { 1, &run_offset_contribution_output_stage }, - { 2, &run_offset_contribution_output_stage }, - { 3, &run_offset_contribution_output_stage }, - { 4, &run_offset_contribution_output_stage }, - { 5, &run_offset_contribution_output_stage }, - { 6, &run_offset_contribution_output_stage }, - { 7, &run_offset_contribution_output_stage }, - { 8, &run_offset_contribution_output_stage_symm }, - { 9, &run_offset_contribution_output_stage_symm }, - { 10, &run_offset_contribution_output_stage_symm }, - { 11, &run_offset_contribution_output_stage_symm }, - { 12, &run_offset_contribution_output_stage_symm }, - { 13, &run_offset_contribution_output_stage_symm }, - { 14, &run_offset_contribution_output_stage_symm }, - { 15, &run_offset_contribution_output_stage_symm } + static std::map map_function_qasymm = + { + { 0, &run_offset_contribution_output_stage }, + { 1, &run_offset_contribution_output_stage }, + { 2, &run_offset_contribution_output_stage }, + { 3, &run_offset_contribution_output_stage }, + { 4, &run_offset_contribution_output_stage }, + { 5, &run_offset_contribution_output_stage }, + { 6, &run_offset_contribution_output_stage }, + { 7, &run_offset_contribution_output_stage }, + { 8, &run_offset_contribution_output_stage }, + { 9, &run_offset_contribution_output_stage }, + { 10, &run_offset_contribution_output_stage }, + { 11, &run_offset_contribution_output_stage }, + { 12, &run_offset_contribution_output_stage }, + { 13, &run_offset_contribution_output_stage }, + { 14, &run_offset_contribution_output_stage }, + { 15, &run_offset_contribution_output_stage }, + }; + + static std::map map_function_qsymm = + { + { 0, &run_offset_contribution_output_stage_symm }, + { 1, &run_offset_contribution_output_stage_symm }, + { 2, &run_offset_contribution_output_stage_symm }, + { 3, &run_offset_contribution_output_stage_symm }, + { 4, &run_offset_contribution_output_stage_symm }, + { 5, &run_offset_contribution_output_stage_symm }, + { 6, &run_offset_contribution_output_stage_symm }, + { 7, &run_offset_contribution_output_stage_symm } }; // Check if input is a 3D reinterpretation @@ -877,12 +916,23 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; // Check if symmetric per-channel execution - const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED; + const bool is_signed = output->info()->data_type() == DataType::QASYMM8_SIGNED; + + // Check if symmetric 
per-channel execution + const bool is_symm = output_stage.is_quantized_per_channel; // key acts as a bitset, setting the first bit on reinterpret_as_3d, // the second on is_bounded_relu, and the third on is_fixed_point. - uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 1UL : 0UL) << 3); - return map_function.find(key)->second; + uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2); + if(is_symm) + { + return map_function_qsymm.find(key)->second; + } + else + { + key |= ((is_signed ? 1UL : 0UL) << 3); + return map_function_qasymm.find(key)->second; + } } } // namespace diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 0641d6cfa3..f57b94d70b 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -49,8 +49,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c bool has_bias, const Size2D &dilation, unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on NEON"); @@ -382,6 +382,7 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col : &NEIm2ColKernel::run_im2col; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::QASYMM8_SIGNED: case DataType::QASYMM8: _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col : &NEIm2ColKernel::run_im2col; break; @@ -403,7 +404,10 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col : &NEIm2ColKernel::run_im2col; + _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col : &NEIm2ColKernel::run_im2col; + break; + case DataType::QASYMM8_SIGNED: + _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col : &NEIm2ColKernel::run_im2col; break; default: ARM_COMPUTE_ERROR("Data type not supported"); diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp index 649316442e..aa43ad587e 100644 --- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp @@ -49,7 +49,9 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output) { //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. 
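get_configured_function() above selects the templated kernel loop by packing its configuration flags into a small bitset key: with this change the symmetric per-channel table keeps a 3-bit key, while the asymmetric table gains a fourth bit distinguishing QASYMM8_SIGNED output. A standalone sketch of that key construction (the function name is illustrative, not the library's):

#include <cstdint>

// bit0 = reinterpret_as_3d, bit1 = is_bounded_relu, bit2 = is_fixed_point,
// bit3 (asymmetric table only) = is_signed, i.e. QASYMM8_SIGNED output.
inline uint8_t make_dispatch_key(bool reinterpret_as_3d, bool is_bounded_relu, bool is_fixed_point, bool is_signed)
{
    uint8_t key = (reinterpret_as_3d ? 1U : 0U) | ((is_bounded_relu ? 1U : 0U) << 1) | ((is_fixed_point ? 1U : 0U) << 2);
    key |= (is_signed ? 1U : 0U) << 3;
    return key;
}

// Example: fixed-point, bounded-relu, 2D, signed output -> key 0b1110 = 14,
// which indexes the QASYMM8_SIGNED entries of map_function_qasymm.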
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, + DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); if(biases != nullptr) diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index a730749b8b..bb9620b293 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -59,7 +59,9 @@ void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const I Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, + DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); if(biases != nullptr) @@ -114,11 +116,12 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset - const QuantizationInfo iqinfo = input->info()->quantization_info(); - const QuantizationInfo wqinfo = weights->info()->quantization_info(); - const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info(); - const UniformQuantizationInfo uiqinfo = iqinfo.uniform(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const QuantizationInfo iqinfo = input->info()->quantization_info(); + const QuantizationInfo wqinfo = weights->info()->quantization_info(); + const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info(); + const UniformQuantizationInfo uiqinfo = iqinfo.uniform(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = input->info()->data_type(); input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); if(!is_data_type_quantized_per_channel(weights->info()->data_type())) @@ -128,23 +131,28 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w } // Merge activation with output stage - int min_activation = 0; - int max_activation = 255; + PixelValue type_min = 0; + PixelValue type_max = 0; + std::tie(type_min, type_max) = get_min_max(data_type); + int min_activation = type_min.get(); + int max_activation = type_max.get(); if(supported_acts.count(act_info.activation()) != 0) { - const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo); - const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo); + const bool is_quantized_signed = is_data_type_quantized_asymmetric_signed(data_type); + const int a_const_int = is_quantized_signed ? quantize_qasymm8_signed(act_info.a(), uoqinfo) : quantize_qasymm8(act_info.a(), uoqinfo); + const int b_const_int = is_quantized_signed ? 
quantize_qasymm8_signed(act_info.b(), uoqinfo) : quantize_qasymm8(act_info.b(), uoqinfo); min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int; - max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int; + max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? max_activation : a_const_int; } GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = uoqinfo.offset; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL); quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info); _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info)); @@ -163,8 +171,9 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) { - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - const bool is_activation_enabled = act_info.enabled(); + const DataType data_type = input->data_type(); + const bool is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool is_activation_enabled = act_info.enabled(); // Create GEMMInfo structure const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, @@ -181,8 +190,11 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); // Merge activation with output stage - int min_activation = 0; - int max_activation = 255; + PixelValue type_min = 0; + PixelValue type_max = 0; + std::tie(type_min, type_max) = get_min_max(data_type); + int min_activation = type_min.get(); + int max_activation = type_max.get(); const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, @@ -190,18 +202,20 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens }; if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) { - const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo); - const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo); + const bool is_quantized_signed = is_data_type_quantized_asymmetric_signed(data_type); + const int a_const_int = is_quantized_signed ? quantize_qasymm8_signed(act_info.a(), uoqinfo) : quantize_qasymm8(act_info.a(), uoqinfo); + const int b_const_int = is_quantized_signed ? quantize_qasymm8_signed(act_info.b(), uoqinfo) : quantize_qasymm8(act_info.b(), uoqinfo); min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int; - max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 
255 : a_const_int; + max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? max_activation : a_const_int; } GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = uoqinfo.offset; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info)); // Perform validation step on GEMMLowp @@ -387,8 +401,8 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON"); diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 5b9d0551e2..e36cb3d399 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -280,9 +280,9 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); diff --git 
a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp index fef4510405..afefee77be 100644 --- a/tests/validation/Helpers.cpp +++ b/tests/validation/Helpers.cpp @@ -122,53 +122,53 @@ SimpleTensor convert_from_asymmetric(const SimpleTensor &src) } template <> -SimpleTensor convert_from_asymmetric(const SimpleTensor &src) +SimpleTensor convert_from_asymmetric(const SimpleTensor &src) { const UniformQuantizationInfo &quantization_info = src.quantization_info().uniform(); SimpleTensor dst{ src.shape(), DataType::F32, 1, QuantizationInfo(), src.data_layout() }; for(int i = 0; i < src.num_elements(); ++i) { - dst[i] = dequantize_qasymm16(src[i], quantization_info); + dst[i] = dequantize_qasymm8_signed(src[i], quantization_info); } return dst; } template <> -SimpleTensor convert_to_asymmetric(const SimpleTensor &src, const QuantizationInfo &quantization_info) +SimpleTensor convert_from_asymmetric(const SimpleTensor &src) { - SimpleTensor dst{ src.shape(), DataType::QASYMM8, 1, quantization_info }; - const UniformQuantizationInfo &qinfo = quantization_info.uniform(); + const UniformQuantizationInfo &quantization_info = src.quantization_info().uniform(); + SimpleTensor dst{ src.shape(), DataType::F32, 1, QuantizationInfo(), src.data_layout() }; for(int i = 0; i < src.num_elements(); ++i) { - dst[i] = quantize_qasymm8(src[i], qinfo); + dst[i] = dequantize_qasymm16(src[i], quantization_info); } return dst; } template <> -SimpleTensor convert_to_asymmetric(const SimpleTensor &src, const QuantizationInfo &quantization_info) +SimpleTensor convert_to_asymmetric(const SimpleTensor &src, const QuantizationInfo &quantization_info) { - SimpleTensor dst{ src.shape(), DataType::QASYMM8_SIGNED, 1, quantization_info }; + SimpleTensor dst{ src.shape(), DataType::QASYMM8, 1, quantization_info }; const UniformQuantizationInfo &qinfo = quantization_info.uniform(); for(int i = 0; i < src.num_elements(); ++i) { - dst[i] = quantize_qasymm8_signed(src[i], qinfo); + dst[i] = quantize_qasymm8(src[i], qinfo); } return dst; } template <> -SimpleTensor convert_from_asymmetric(const SimpleTensor &src) +SimpleTensor convert_to_asymmetric(const SimpleTensor &src, const QuantizationInfo &quantization_info) { - const UniformQuantizationInfo &quantization_info = src.quantization_info().uniform(); - SimpleTensor dst{ src.shape(), DataType::F32, 1, QuantizationInfo(), src.data_layout() }; + SimpleTensor dst{ src.shape(), DataType::QASYMM8_SIGNED, 1, quantization_info }; + const UniformQuantizationInfo &qinfo = quantization_info.uniform(); for(int i = 0; i < src.num_elements(); ++i) { - dst[i] = dequantize_qasymm8_signed(src[i], quantization_info); + dst[i] = quantize_qasymm8_signed(src[i], qinfo); } return dst; } @@ -354,6 +354,15 @@ std::pair get_quantized_bounds(const QuantizationInfo &quant_info, flo return std::pair { min_bound, max_bound }; } +std::pair get_quantized_qasymm8_signed_bounds(const QuantizationInfo &quant_info, float min, float max) +{ + ARM_COMPUTE_ERROR_ON_MSG(min > max, "min must be lower equal than max"); + + const int min_bound = quantize_qasymm8_signed(min, quant_info.uniform()); + const int max_bound = quantize_qasymm8_signed(max, quant_info.uniform()); + return std::pair { min_bound, max_bound }; +} + std::pair get_symm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id) { ARM_COMPUTE_ERROR_ON_MSG(min > max, "min must be lower equal than max"); diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h index 3227a98b05..b481b52443 100644 
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -262,6 +262,14 @@ void zeros(SimpleTensor<T> &in, const Coordinates &anchor, const TensorShape &sh
  */
 std::pair<int, int> get_quantized_bounds(const QuantizationInfo &quant_info, float min, float max);
 
+/** Helper function to compute asymmetric quantized signed min and max bounds
+ *
+ * @param[in] quant_info Quantization info to be used for conversion
+ * @param[in] min        Floating point minimum value to be quantized
+ * @param[in] max        Floating point maximum value to be quantized
+ */
+std::pair<int, int> get_quantized_qasymm8_signed_bounds(const QuantizationInfo &quant_info, float min, float max);
+
 /** Helper function to compute symmetric quantized min and max bounds
  *
  * @param[in] quant_info Quantization info to be used for conversion
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index c2a0cb56a2..1d7805d024 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -462,6 +462,19 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>
 }
 TEST_SUITE_END() // QASYMM8
 
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                       framework::dataset::make("ReshapeWeights", { true })),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                       QuantizedActivationFunctionsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
 TEST_SUITE(QSYMM8_PER_CHANNEL)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerReducedDataset(),
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index c5cddc28db..3c4b625ac6 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -52,7 +52,9 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class ConvolutionValidationGenericFixture : public framework::Fixture
 {
 public:
-    using TBias = typename std::conditional<std::is_same<typename std::decay<T>::type, uint8_t>::value, int32_t, T>::type;
+    using TBias = typename std::conditional < std::is_same<typename std::decay<T>::type, uint8_t>::value
+                  || std::is_same<typename std::decay<T>::type, int8_t>::value,
+                  int32_t, T >::type;
 
 public:
     template <typename...>
@@ -84,6 +86,13 @@ protected:
                 library->fill(tensor, distribution, i);
                 break;
             }
+            case DataType::QASYMM8_SIGNED:
+            {
+                std::pair<int, int> bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                std::uniform_int_distribution<int8_t> distribution(bounds.first, bounds.second);
+                library->fill(tensor, distribution, i);
+                break;
+            }
             case DataType::QSYMM8_PER_CHANNEL:
             {
                 int min_bound = 128;
diff --git a/tests/validation/reference/ActivationLayer.cpp b/tests/validation/reference/ActivationLayer.cpp
index 6cdba09c75..7a699c5f86 100644
--- a/tests/validation/reference/ActivationLayer.cpp
+++ b/tests/validation/reference/ActivationLayer.cpp
@@ -65,6 +65,17 @@ SimpleTensor<uint8_t> activation_layer<uint8_t>(const SimpleTensor<uint8_t> &src
     return dst;
 }
 
+template <>
+SimpleTensor<int8_t> activation_layer<int8_t>(const SimpleTensor<int8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
+{
+    const QuantizationInfo dst_qinfo = oq_info.empty() ? src.quantization_info() : oq_info;
+
+    SimpleTensor<float>  src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float>  dst_tmp = activation_layer<float>(src_tmp, info);
+    SimpleTensor<int8_t> dst     = convert_to_asymmetric<int8_t>(dst_tmp, dst_qinfo);
+    return dst;
+}
+
 template <>
 SimpleTensor<int16_t> activation_layer<int16_t>(const SimpleTensor<int16_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
 {
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
index 6ac5df93b3..6168f10741 100644
--- a/tests/validation/reference/Convolution3d.h
+++ b/tests/validation/reference/Convolution3d.h
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H
 #define ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H
 
+#include "arm_compute/core/utils/misc/Requires.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
@@ -94,10 +95,8 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &wei
 }
 
 // 3D convolution for QASYMM8 type
-template < typename T, typename TW, typename TB, typename std::enable_if < std::is_same<T, uint8_t>::value &&(std::is_same<TW, uint8_t>::value
-                                                                                                               || std::is_same<TW, int8_t>::value)
-                                                                           &&std::is_same<TB, int32_t>::value,
-                                                                           int >::type = 0 >
+template < typename T, typename TW, typename TB, REQUIRES_TA((std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) &&(std::is_same<TW, uint8_t>::value
+                                                                                                                                     || std::is_same<TW, int8_t>::value)) >
 inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
                           int i_offset, int w_offset, int b_offset, int o_offset,
                           int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1, int filter_id = 0)
@@ -172,7 +171,8 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &wei
         acc += (*b_ptr);
 
         // Quantize down
-        acc = validation::quantize_down_scale_by_fixedpoint(acc, output_multiplier, output_shift, output_offset, 0, 255);
+        acc = validation::quantize_down_scale_by_fixedpoint(acc, output_multiplier, output_shift, output_offset,
+                                                            std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max());
 
         // Store the result
         *out_ptr = acc;
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 4d2c1acb6f..c9ad8d38b9 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -41,10 +41,6 @@ namespace validation
 {
 namespace reference
 {
-namespace
-{
-} // namespace
-
 template <typename T, typename TW, typename TB>
 SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
                                        const Size2D &dilation, unsigned int num_groups)
@@ -141,6 +137,8 @@ template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &sr
                                                  const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
 template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
                                                  const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
+template SimpleTensor<int8_t> convolution_layer(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
+                                                const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
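
For reviewers unfamiliar with the QASYMM8_SIGNED layout, the following stand-alone sketch (not part of the patch; the *_sketch names are hypothetical) illustrates the affine int8 quantization that helpers such as quantize_qasymm8_signed/dequantize_qasymm8_signed and the new get_quantized_qasymm8_signed_bounds() implement: q = round(x / scale) + offset, clamped to [-128, 127], and x is recovered as (q - offset) * scale. The library helpers additionally take a rounding policy, which is omitted here.

    // Minimal sketch of the QASYMM8_SIGNED affine quantization scheme (illustrative only).
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <iostream>
    #include <utility>

    // Quantize a float: round(value / scale) + offset, saturated to the int8 range.
    int8_t quantize_qasymm8_signed_sketch(float value, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(value / scale)) + offset;
        return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, q)));
    }

    // Dequantize back to float: (q - offset) * scale.
    float dequantize_qasymm8_signed_sketch(int8_t q, float scale, int32_t offset)
    {
        return (static_cast<int32_t>(q) - offset) * scale;
    }

    // Analogue of get_quantized_qasymm8_signed_bounds(): quantize both ends of a float range.
    std::pair<int, int> quantized_signed_bounds_sketch(float min, float max, float scale, int32_t offset)
    {
        return { quantize_qasymm8_signed_sketch(min, scale, offset),
                 quantize_qasymm8_signed_sketch(max, scale, offset) };
    }

    int main()
    {
        const float   scale  = 2.f / 255.f; // same scale used by the QASYMM8_SIGNED test case above
        const int32_t offset = 10;
        const auto    bounds = quantized_signed_bounds_sketch(-1.0f, 1.0f, scale, offset);
        std::cout << "fill range: [" << bounds.first << ", " << bounds.second << "]\n";
        std::cout << "round trip of 0.5f: "
                  << dequantize_qasymm8_signed_sketch(quantize_qasymm8_signed_sketch(0.5f, scale, offset), scale, offset)
                  << "\n";
        return 0;
    }

This is the range the fixture's fill() uses when it draws int8 samples between the quantized bounds of [-1.0f, 1.0f].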
-- cgit v1.2.1
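
A second stand-alone sketch (again not part of the patch; requantize_down_sketch is a hypothetical name) shows the effect of the Convolution3d.h change above: the down-scaled int32 accumulator is now clamped to the output type's own limits instead of the fixed unsigned range 0..255, which is what makes the reference convolution usable for QASYMM8_SIGNED. The library's quantize_down_scale_by_fixedpoint() uses fixed-point arithmetic with rounding; this sketch approximates it with a plain float multiply, so results may differ by one unit.

    // Minimal sketch of the saturating requantization clamp (illustrative only).
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <iostream>
    #include <limits>

    template <typename T>
    T requantize_down_sketch(int32_t acc, float effective_scale, int32_t output_offset)
    {
        // Scale the accumulator down and add the output offset.
        const int32_t q  = static_cast<int32_t>(std::lround(acc * effective_scale)) + output_offset;
        // Clamp to the output type's range: [0, 255] for uint8_t, [-128, 127] for int8_t.
        const int32_t lo = std::numeric_limits<T>::lowest();
        const int32_t hi = std::numeric_limits<T>::max();
        return static_cast<T>(std::min(hi, std::max(lo, q)));
    }

    int main()
    {
        const int32_t acc             = -4000; // int32 accumulator from the quantized GEMM
        const float   effective_scale = 0.02f; // stands in for input_scale * weights_scale / output_scale
        // A negative accumulator saturates to 0 for an unsigned output,
        // but maps into the signed range for an int8 output.
        std::cout << static_cast<int32_t>(requantize_down_sketch<uint8_t>(acc, effective_scale, 0)) << "\n"; // prints 0
        std::cout << static_cast<int32_t>(requantize_down_sketch<int8_t>(acc, effective_scale, 0)) << "\n";  // prints -80
        return 0;
    }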