From 7485d5a62685cb745ab50e970adb722cb71557ac Mon Sep 17 00:00:00 2001 From: Vidhya Sudhan Loganathan Date: Wed, 4 Jul 2018 09:34:00 +0100 Subject: COMPMID-970 : Remove QS8 / QS16 support Removed fixed point related code. Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../core/CL/kernels/CLActivationLayerKernel.h | 4 +- .../core/CL/kernels/CLArithmeticAdditionKernel.h | 12 +- .../CL/kernels/CLArithmeticSubtractionKernel.h | 14 +- .../CL/kernels/CLBatchNormalizationLayerKernel.h | 4 +- .../core/CL/kernels/CLChannelShuffleLayerKernel.h | 4 +- arm_compute/core/CL/kernels/CLCol2ImKernel.h | 4 +- .../kernels/CLConvertFullyConnectedWeightsKernel.h | 4 +- .../CL/kernels/CLDepthConcatenateLayerKernel.h | 4 +- .../core/CL/kernels/CLDepthConvertLayerKernel.h | 9 +- .../CL/kernels/CLDirectConvolutionLayerKernel.h | 4 +- .../CLDirectConvolutionLayerOutputStageKernel.h | 8 +- arm_compute/core/CL/kernels/CLFillBorderKernel.h | 2 +- .../core/CL/kernels/CLGEMMInterleave4x4Kernel.h | 4 +- .../kernels/CLGEMMMatrixAccumulateBiasesKernel.h | 6 +- .../core/CL/kernels/CLGEMMMatrixAdditionKernel.h | 4 +- .../core/CL/kernels/CLGEMMMatrixMultiplyKernel.h | 4 +- .../core/CL/kernels/CLGEMMTranspose1xWKernel.h | 4 +- arm_compute/core/CL/kernels/CLIm2ColKernel.h | 6 +- .../core/CL/kernels/CLNormalizationLayerKernel.h | 4 +- arm_compute/core/CL/kernels/CLPermuteKernel.h | 4 +- .../CL/kernels/CLPixelWiseMultiplicationKernel.h | 12 +- arm_compute/core/CL/kernels/CLPoolingLayerKernel.h | 5 +- arm_compute/core/CL/kernels/CLReshapeLayerKernel.h | 4 +- arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h | 16 +- arm_compute/core/CL/kernels/CLTransposeKernel.h | 6 +- .../core/CL/kernels/CLWeightsReshapeKernel.h | 4 +- .../CL/kernels/CLWidthConcatenateLayerKernel.h | 4 +- arm_compute/core/CPP/kernels/CPPPermuteKernel.h | 4 +- arm_compute/core/FixedPoint.h | 373 ---- arm_compute/core/FixedPoint.inl | 406 ---- .../kernels/GCBatchNormalizationLayerKernel.h | 2 +- arm_compute/core/Helpers.h | 24 +- arm_compute/core/Helpers.inl | 14 - arm_compute/core/ITensorInfo.h | 15 +- arm_compute/core/NEON/NEFixedPoint.h | 1184 ------------ arm_compute/core/NEON/NEFixedPoint.inl | 1958 -------------------- .../core/NEON/kernels/NEActivationLayerKernel.h | 17 +- .../core/NEON/kernels/NEArithmeticAdditionKernel.h | 20 +- .../NEON/kernels/NEArithmeticSubtractionKernel.h | 20 +- .../NEON/kernels/NEBatchNormalizationLayerKernel.h | 21 +- arm_compute/core/NEON/kernels/NECol2ImKernel.h | 4 +- .../kernels/NEConvertFullyConnectedWeightsKernel.h | 4 +- .../NEON/kernels/NEDepthConcatenateLayerKernel.h | 2 +- .../core/NEON/kernels/NEDepthConvertLayerKernel.h | 13 +- .../NEON/kernels/NEDirectConvolutionLayerKernel.h | 8 +- .../NEDirectConvolutionLayerOutputStageKernel.h | 8 +- arm_compute/core/NEON/kernels/NEFillBorderKernel.h | 2 +- .../core/NEON/kernels/NEFillInnerBorderKernel.h | 2 +- .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 6 +- .../kernels/NEGEMMMatrixAccumulateBiasesKernel.h | 4 +- .../core/NEON/kernels/NEGEMMMatrixAdditionKernel.h | 4 +- .../core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h | 4 +- .../core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 4 +- arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 4 +- .../core/NEON/kernels/NENormalizationLayerKernel.h | 16 +- arm_compute/core/NEON/kernels/NEPermuteKernel.h | 4 +- .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 24 +- .../core/NEON/kernels/NEPoolingLayerKernel.h | 50 +- 
.../core/NEON/kernels/NEReshapeLayerKernel.h | 2 +- .../core/NEON/kernels/NESoftmaxLayerKernel.h | 8 +- arm_compute/core/NEON/kernels/NETransposeKernel.h | 6 +- .../core/NEON/kernels/NEWeightsReshapeKernel.h | 4 +- .../NEON/kernels/detail/NEDirectConvolution3x3.h | 14 +- .../kernels/detail/NEDirectConvolutionDetail.h | 249 +-- arm_compute/core/SubTensorInfo.h | 11 - arm_compute/core/TensorInfo.h | 54 +- arm_compute/core/Types.h | 2 - arm_compute/core/Utils.h | 30 - arm_compute/core/Validate.h | 156 -- 69 files changed, 230 insertions(+), 4687 deletions(-) delete mode 100644 arm_compute/core/FixedPoint.h delete mode 100644 arm_compute/core/FixedPoint.inl (limited to 'arm_compute/core') diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h index c6d8f96a87..12d00de7e8 100644 --- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h @@ -51,7 +51,7 @@ public: * @note If the output tensor is a nullptr, the activation function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * of the activation function. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] act_info Activation layer information. */ @@ -59,7 +59,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * of the activation function. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor info. Data type supported: same as @p input * @param[in] act_info Activation layer information. * diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h index a33cbf321f..f4275f4153 100644 --- a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h +++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h @@ -53,17 +53,17 @@ public: ~CLArithmeticAdditionKernel() = default; /** Initialise the kernel's inputs, output and convertion policy. * - * @param[in] input1 First tensor input. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. - * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32. - * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8),QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32. + * @param[in] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32. * @param[in] policy Policy to use to handle overflow. 
*/ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAdditionKernel * - * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32. - * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32. - * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32. * @param[in] policy Policy to use to handle overflow. * * @return a status diff --git a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h index c5f862a61f..35b918fe4b 100644 --- a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h +++ b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,17 +55,17 @@ public: /** Initialise the kernel's inputs, output and convertion policy. * - * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32. - * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. - * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32. + * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16/F32. * @param[in] policy Policy to use to handle overflow. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtractionKernel * - * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32. - * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. - * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), S16/F16/F32. * @param[in] policy Policy to use to handle overflow. 
* * @return a status diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h index 8015f08d1b..9c8d02532a 100644 --- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h @@ -54,7 +54,7 @@ public: * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input @@ -69,7 +69,7 @@ public: * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input diff --git a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h b/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h index 684a0e5027..f7bd205ec7 100644 --- a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h @@ -48,14 +48,14 @@ public: ~CLChannelShuffleLayerKernel() = default; /** Configure function's inputs and outputs. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. */ void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups); /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. 
* diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h index 3779325efe..94f21b1ebc 100644 --- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h +++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h @@ -66,7 +66,7 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * @param[in] input The input tensor to convert. Data types supported: QASYMM8/F16/F32 * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. @@ -74,7 +74,7 @@ public: void configure(const ICLTensor *input, ICLTensor *output, std::pair convolved_dims); /** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel * - * @param[in] input The input tensor to convert. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * @param[in] input The input tensor to convert. Data types supported: QASYMM8/F16/F32 * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. diff --git a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h index fe24aa9d8c..f5e2f0de89 100644 --- a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h +++ b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h @@ -55,7 +55,7 @@ public: ~CLConvertFullyConnectedWeightsKernel() = default; /** Set the input and output tensor. * - * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32. + * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32. * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format. * @param[in] data_layout The data layout the weights have been trained in. @@ -63,7 +63,7 @@ public: void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeightsKernel * - * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32. + * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32. * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format. * @param[in] data_layout The data layout the weights have been trained in. 
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h index 467bdfab3b..cbcab8f554 100644 --- a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -52,7 +52,7 @@ public: ~CLDepthConcatenateLayerKernel() = default; /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. + * @param[in] input Input tensor. Data types supported: F16/F32. * @param[in] depth_offset The offset on the Z axis. * @param[in,out] output Output tensor. Data types supported: Same as @p input. * diff --git a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h index 3a6310d69e..7e795c607a 100644 --- a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,17 +43,14 @@ public: * * Valid conversions Input -> Output : * - * - QS8 -> F32 - * - QS16 -> F32 * - U8 -> U16, S16, U32, S32 * - U16 -> U8, U32, S32 * - S16 -> U8, U32, S32 * - U32 -> U8, U16, S16 * - S32 -> U8, U16, S16 - * - F32 -> QS8, QS16 * - * @param[in] input The input tensor to convert. Data types supported: U8/QS8/U16/S16/QS16/U32/S32/F32. - * @param[out] output The output tensor. Data types supported: U8/QS8/U16/S16/QS16/U32/S32/F32. + * @param[in] input The input tensor to convert. Data types supported: U8/U16/S16/U32/S32/F32. + * @param[out] output The output tensor. Data types supported: U8/U16/S16/U32/S32/F32. * @param[in] policy Conversion policy * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8. */ diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h index eb1bf58b1b..bd37e35334 100644 --- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h @@ -56,7 +56,7 @@ public: * 5x5 convolution with stride_x = 1/2, stride_y = 1/2 * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. @@ -70,7 +70,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h index 9340e9a8d8..1947a98ba3 100644 --- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h +++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h @@ -51,11 +51,11 @@ public: /** Set the accumulate buffer and the biases of the kernel. * * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: S32/QS16/QS32/F16/F32 + * Data type supported: S32/QS32/F16/F32 * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) * Required parameter if output is of QASYMM8 type. - * Data types supported: QS8/QASYMM8/QS16/F16/F32 + * Data types supported: QASYMM8/F16/F32 * @param[in] result_fixedpoint_multiplier (Optional)Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add * @param[in] result_shift (Optional)Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication * @param[in] result_offset_after_shift (Optional)Offset to be applied to result before converting it back to QASYMM8 @@ -65,10 +65,10 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerOutputStageKernel * * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS16/QS32/F16/F32 + * Data type supported: QS32/F16/F32 * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Data type supported: QS8/QS16/F16/F32 + * Data type supported: F16/F32 * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr); diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h index 18031c7e7e..20e872eccb 100644 --- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h +++ b/arm_compute/core/CL/kernels/CLFillBorderKernel.h @@ -51,7 +51,7 @@ public: /** Initialise the kernel's input, output and border mode. * - * @param[in,out] tensor Tensor to process Data types supported: U8/QS8/S16/QS16/S32/F16/F32. + * @param[in,out] tensor Tensor to process Data types supported: U8/S16/S32/F16/F32. * @param[in] border_size Size of the border to fill in elements. * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h index c0fef45afe..7f8e766f1a 100644 --- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h @@ -64,14 +64,14 @@ public: CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block */ void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. * @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block * diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h index 2956f93cdc..f201af0d5e 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -46,13 +46,13 @@ public: CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default; /** Set the accumulate buffer and the biases of the kernel. * - * @param[in, out] accum The accumulate tensor to convert. Data types supported: QS8/QS16/F16/F32 + * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input */ void configure(ICLTensor *accum, const ICLTensor *biases); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAccumulateBiasesKernel * - * @param[in] accum The accumulate tensor to convert. Data types supported: QS8/QS16/F16/F32 + * @param[in] accum The accumulate tensor to convert. Data types supported: F16/F32 * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input * @param[in] gpu_target GPU target * diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h index 3755d943c5..bf8e1d4b17 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h @@ -52,14 +52,14 @@ public: * * @note The input and output tensors must have the same dimensions * - * @param[in] input Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32 + * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32 * @param[in, out] output Output tensor. 
If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref CLGEMMMatrixMultiplyKernel. Data type supported: same as @p input * @param[in] beta Weight of matrix C */ void configure(const ICLTensor *input, ICLTensor *output, float beta); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAdditionKernel. * - * @param[in] input Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32 + * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32 * @param[in] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref CLGEMMMatrixMultiplyKernel. Data type supported: same as @p input * @param[in] beta Weight of matrix C * diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h index 15bba0cd0f..1b6a0c87a9 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h @@ -53,7 +53,7 @@ public: CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default; /** Initialise the kernel's input, output and alpha * - * @param[in] input0 Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32 + * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32 * @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0 * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 * @param[in] alpha Weight of the matrix product @@ -64,7 +64,7 @@ public: void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel * - * @param[in] input0 Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32 + * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32 * @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0 * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 * @param[in] alpha Weight of the matrix product diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h index 9a3069eab6..47a4ad515b 100644 --- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h @@ -70,14 +70,14 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. 
Data type supported: same as @p input * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block */ void configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor. Data type supported: same as @p input. * @param[in] mult_transpose1xW_width Multiplication factor for the width of the 1xW transposed block * diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h index 7e119a32a8..fc930abcbe 100644 --- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h +++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h @@ -69,7 +69,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, * while every dimension above represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -81,7 +81,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, * while every dimension above represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -113,7 +113,7 @@ private: /** Chooses and configure the right kernel for the given input arguments. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, * while every dimension above represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h index ef00e59e5c..f2d37a781c 100644 --- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h @@ -48,7 +48,7 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor. 
3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32. + * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ @@ -56,7 +56,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32. + * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. * diff --git a/arm_compute/core/CL/kernels/CLPermuteKernel.h b/arm_compute/core/CL/kernels/CLPermuteKernel.h index b01df64ebd..21da141c0d 100644 --- a/arm_compute/core/CL/kernels/CLPermuteKernel.h +++ b/arm_compute/core/CL/kernels/CLPermuteKernel.h @@ -49,14 +49,14 @@ public: CLPermuteKernel &operator=(CLPermuteKernel &&) = default; /** Set the input and output of the kernel. * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output The output tensor. Data types supported: Same as @p input * @param[in] perm Permutation vector */ void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration of @ref CLPermuteKernel * - * @param[in] input First tensor input info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32. + * @param[in] input First tensor input info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. * @param[in] output Output tensor info. Data types supported: same as @p input. * @param[in] perm Permutation vector * diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h index fcabb614df..b835aa701b 100644 --- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h @@ -49,11 +49,11 @@ public: CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default; /** Initialise the kernel's input, output and border mode. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32. * @param[in] input2 An input tensor. Data types supported: same as @p input1. - * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. 
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. */ @@ -61,11 +61,11 @@ public: ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel * - * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32. * @param[in] input2 An input tensor info. Data types supported: same as @p input1. - * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8. * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h index c13507785b..db1a756229 100644 --- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h @@ -51,16 +51,15 @@ public: /** Set the input and output tensors. * - * @note QS8 and QS16 are supported only for pool sizes 3, 5 and 7 * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel * - * @param[in] input Source tensor info. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h index 044b5e7006..b253d66f4f 100644 --- a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -49,7 +49,7 @@ public: ~CLReshapeLayerKernel() = default; /** Set the input and output of the kernel * - * @param[in] input Source tensor. 
Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h index c562565175..b272878fe7 100644 --- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h @@ -38,13 +38,13 @@ class CLLogits1DMaxKernel : public ICLSimple3DKernel public: /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32 * @param[out] output Destination tensor. Data types supported: same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxKernel * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32 * @param[in] output Destination tensor. Data types supported: same as @p input * * @return a status @@ -68,7 +68,7 @@ public: CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32 * @param[in] max Max values tensor. Data types supported: same as @p input * @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input * @param[out] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input @@ -77,7 +77,7 @@ public: void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f); /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DShiftExpSumKernel * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32 * @param[in] max Max values tensor. Data types supported: same as @p input * @param[in] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input * @param[in] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input @@ -116,7 +116,7 @@ public: CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: F16/F32 * @param[in,out] max Max values tensor. Data types supported: same as @p input * @param[out] output Destination tensor. Data types supported: same as @p input * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input @@ -125,7 +125,7 @@ public: void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f); /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel * - * @param[in] input Source tensor. 
Data types supported: QS8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: F16/F32 * @param[in] max Max values tensor. Data types supported: same as @p input * @param[in] output Destination tensor. Data types supported: same as @p input * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input @@ -175,7 +175,7 @@ public: CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32 + * @param[in] input Source tensor. Data types supported: S32/F16/F32 * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input * @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input * @param[in] beta (Optional) A scaling factor for the exponent. (Default = 1.0) @@ -183,7 +183,7 @@ public: void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta = 1.0f); /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel * - * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32 + * @param[in] input Source tensor. Data types supported: S32/F16/F32 * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input * @param[in] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input * diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h index 2e1b481d3f..09d7a8a430 100644 --- a/arm_compute/core/CL/kernels/CLTransposeKernel.h +++ b/arm_compute/core/CL/kernels/CLTransposeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,13 +40,13 @@ class CLTransposeKernel : public ICLSimple2DKernel public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor. Data type supported: Same as @p input * * @return a status diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h index 7a54284199..664fc3c216 100644 --- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h +++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h @@ -69,7 +69,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. 
Data types supported: QASYMM8/F16/F32 * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. @@ -79,7 +79,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32 * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h index 5b8a318320..d206eb0da7 100644 --- a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h @@ -52,7 +52,7 @@ public: ~CLWidthConcatenateLayerKernel() = default; /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] width_offset The offset on the X axis. * @param[in,out] output Output tensor. Data types supported: Same as @p input. * @@ -60,7 +60,7 @@ public: void configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] width_offset The offset on the X axis. * @param[in] output Output tensor info. Data types supported: Same as @p input. * diff --git a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h index 3d6b43641e..5e9ae43ee0 100644 --- a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h +++ b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h @@ -56,14 +56,14 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data types supported: Same as @p input * @param[in] perm Permutation vector */ void configure(const ITensor *input, ITensor *output, const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel * - * @param[in] input The input tensor to permute. 
Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output The output tensor. Data types supported: Same as @p input * @param[in] perm Permutation vector * diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h deleted file mode 100644 index 6e00500b10..0000000000 --- a/arm_compute/core/FixedPoint.h +++ /dev/null @@ -1,373 +0,0 @@ -/* - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_FIXEDPOINT_H__ -#define __ARM_COMPUTE_FIXEDPOINT_H__ - -#include - -namespace arm_compute -{ -using qint8_t = int8_t; /**< 8 bit fixed point scalar value */ -using qint16_t = int16_t; /**< 16 bit fixed point scalar value */ -using qint32_t = int32_t; /**< 32 bit fixed point scalar value */ -using qint64_t = int64_t; /**< 64 bit fixed point scalar value */ - -/** 8 bit fixed point scalar saturating shift left - * - * @param[in] a First 8 bit fixed point input - * @param[in] shift Shift amount (positive only values) - * - * @return The result of the 8 bit fixed point shift. The result is saturated in case of overflow - */ -qint8_t sqshl_qs8(qint8_t a, int shift); - -/** 8 bit fixed point scalar shift right - * - * @param[in] a First 8 bit fixed point input - * @param[in] shift Shift amount (positive only values) - * - * @return The result of the 8 bit fixed point shift - */ -qint8_t sshr_qs8(qint8_t a, int shift); - -/** 16 bit fixed point scalar shift right - * - * @param[in] a First 16 bit fixed point input - * @param[in] shift Shift amount (positive only values) - * - * @return The result of the 16 bit fixed point shift - */ -qint16_t sshr_qs16(qint16_t a, int shift); - -/** 16 bit fixed point scalar saturating shift left - * - * @param[in] a First 16 bit fixed point input - * @param[in] shift Shift amount (positive only values) - * - * @return The result of the 16 bit fixed point shift. 
The result is saturated in case of overflow - */ -qint16_t sqshl_qs16(qint16_t a, int shift); - -/** 8 bit fixed point scalar absolute value - * - * @param[in] a 8 bit fixed point input - * - * @return The result of the 8 bit fixed point absolute value - */ -qint8_t sabs_qs8(qint8_t a); - -/** 16 bit fixed point scalar absolute value - * - * @param[in] a 16 bit fixed point input - * - * @return The result of the 16 bit fixed point absolute value - */ -qint16_t sabs_qs16(qint16_t a); - -/** 8 bit fixed point scalar add - * - * @param[in] a First 8 bit fixed point input - * @param[in] b Second 8 bit fixed point input - * - * @return The result of the 8 bit fixed point addition - */ -qint8_t sadd_qs8(qint8_t a, qint8_t b); - -/** 16 bit fixed point scalar add - * - * @param[in] a First 16 bit fixed point input - * @param[in] b Second 16 bit fixed point input - * - * @return The result of the 16 bit fixed point addition - */ -qint16_t sadd_qs16(qint16_t a, qint16_t b); - -/** 8 bit fixed point scalar saturating add - * - * @param[in] a First 8 bit fixed point input - * @param[in] b Second 8 bit fixed point input - * - * @return The result of the 8 bit fixed point addition. The result is saturated in case of overflow - */ -qint8_t sqadd_qs8(qint8_t a, qint8_t b); - -/** 16 bit fixed point scalar saturating add - * - * @param[in] a First 16 bit fixed point input - * @param[in] b Second 16 bit fixed point input - * - * @return The result of the 16 bit fixed point addition. The result is saturated in case of overflow - */ -qint16_t sqadd_qs16(qint16_t a, qint16_t b); - -/** 32 bit fixed point scalar saturating add - * - * @param[in] a First 32 bit fixed point input - * @param[in] b Second 32 bit fixed point input - * - * @return The result of the 32 bit fixed point addition. The result is saturated in case of overflow - */ -qint32_t sqadd_qs32(qint32_t a, qint32_t b); - -/** 8 bit fixed point scalar subtraction - * - * @param[in] a First 8 bit fixed point input - * @param[in] b Second 8 bit fixed point input - * - * @return The result of the 8 bit fixed point subtraction - */ -qint8_t ssub_qs8(qint8_t a, qint8_t b); - -/** 16 bit fixed point scalar subtraction - * - * @param[in] a First 16 bit fixed point input - * @param[in] b Second 16 bit fixed point input - * - * @return The result of the 16 bit fixed point subtraction - */ -qint16_t ssub_qs16(qint16_t a, qint16_t b); - -/** 8 bit fixed point scalar saturating subtraction - * - * @param[in] a First 8 bit fixed point input - * @param[in] b Second 8 bit fixed point input - * - * @return The result of the 8 bit fixed point subtraction. The result is saturated in case of overflow - */ -qint8_t sqsub_qs8(qint8_t a, qint8_t b); - -/** 16 bit fixed point scalar saturating subtraction - * - * @param[in] a First 16 bit fixed point input - * @param[in] b Second 16 bit fixed point input - * - * @return The result of the 16 bit fixed point subtraction. The result is saturated in case of overflow - */ -qint16_t sqsub_qs16(qint16_t a, qint16_t b); - -/** 8 bit fixed point scalar multiply - * - * @param[in] a First 8 bit fixed point input - * @param[in] b Second 8 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point multiplication. 
- */ -qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position); - -/** 16 bit fixed point scalar multiply - * - * @param[in] a First 16 bit fixed point input - * @param[in] b Second 16 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point multiplication. - */ -qint16_t smul_qs16(qint16_t a, qint16_t b, int fixed_point_position); - -/** 8 bit fixed point scalar saturating multiply - * - * @param[in] a First 8 bit fixed point input - * @param[in] b Second 8 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point multiplication. The result is saturated in case of overflow - */ -qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position); - -/** 16 bit fixed point scalar saturating multiply - * - * @param[in] a First 16 bit fixed point input - * @param[in] b Second 16 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow - */ -qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position); - -/** 8 bit fixed point scalar multiply long - * - * @param[in] a First 8 bit fixed point input - * @param[in] b Second 8 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point multiplication long. The result is saturated in case of overflow - */ -qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position); - -/** 16 bit fixed point scalar multiply long - * - * @param[in] a First 16 bit fixed point input - * @param[in] b Second 16 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point multiplication long. The result is saturated in case of overflow - */ -qint32_t sqmull_qs16(qint16_t a, qint16_t b, int fixed_point_position); - -/** 8 bit fixed point scalar inverse square root - * - * @param[in] a 8 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point inverse square root. - */ -qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position); - -/** 16 bit fixed point scalar inverse square root - * - * @param[in] a 16 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point inverse square root.
- */ -qint16_t sinvsqrt_qs16(qint16_t a, int fixed_point_position); - -/** 8 bit fixed point scalar division - * - * @param[in] a First 8 bit fixed point input - * @param[in] b Second 8 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point division. - */ -qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position); - -/** 16 bit fixed point scalar division - * - * @param[in] a First 16 bit fixed point input - * @param[in] b Second 16 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point division. - */ -qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position); - -/** 8 bit fixed point scalar exponential - * - * @param[in] a 8 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point exponential. - */ -qint8_t sqexp_qs8(qint8_t a, int fixed_point_position); - -/** 16 bit fixed point scalar exponential - * - * @param[in] a 16 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point exponential. - */ -qint16_t sqexp_qs16(qint16_t a, int fixed_point_position); - -/** 16 bit fixed point scalar exponential - * - * @param[in] a 16 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point exponential. - */ -qint16_t sexp_qs16(qint16_t a, int fixed_point_position); - -/** 8 bit fixed point scalar logarithm - * - * @param[in] a 8 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point logarithm. - */ -qint8_t slog_qs8(qint8_t a, int fixed_point_position); - -/** 16 bit fixed point scalar logarithm - * - * @param[in] a 16 bit fixed point input - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point logarithm. 
- */ -qint16_t slog_qs16(qint16_t a, int fixed_point_position); - -/** Convert an 8 bit fixed point to float - * - * @param[in] a Input to convert - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion 8 bit fixed point -> float - */ -float scvt_f32_qs8(qint8_t a, int fixed_point_position); - -/** Convert a float to 8 bit fixed point - * - * @param[in] a Input to convert - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion float -> 8 bit fixed point - */ -qint8_t sqcvt_qs8_f32(float a, int fixed_point_position); - -/** Convert a 16 bit fixed point to float - * - * @param[in] a Input to convert - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion 16 bit fixed point -> float - */ -float scvt_f32_qs16(qint16_t a, int fixed_point_position); - -/** Convert a float to 16 bit fixed point - * - * @param[in] a Input to convert - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion float -> 16 bit fixed point - */ -qint16_t sqcvt_qs16_f32(float a, int fixed_point_position); - -/** Scalar saturating move and narrow. - * - * @param[in] a Input to convert to 8 bit fixed point - * - * @return The narrowing conversion to 8 bit - */ -qint8_t sqmovn_qs16(qint16_t a); - -/** Scalar saturating move and narrow. - * - * @param[in] a Input to convert to 16 bit fixed point - * - * @return The narrowing conversion to 16 bit - */ -qint16_t sqmovn_qs32(qint32_t a); -} -#include "arm_compute/core/FixedPoint.inl" -#endif /* __ARM_COMPUTE_FIXEDPOINT_H__ */ diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl deleted file mode 100644 index eb3516e8db..0000000000 --- a/arm_compute/core/FixedPoint.inl +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Error.h" -#include "arm_compute/core/utils/misc/Utility.h" - -#include <cmath> -#include <limits> - -namespace arm_compute -{ -inline qint8_t sqshl_qs8(qint8_t a, int shift) -{ - qint16_t tmp = static_cast<qint16_t>(a) << shift; - - // Saturate the result in case of overflow and cast to qint8_t - return utility::saturate_cast<qint8_t>(tmp); -} - -inline qint16_t sqshl_qs16(qint16_t a, int shift) -{ - qint32_t tmp = static_cast<qint32_t>(a) << shift; - - // Saturate the result in case of overflow and cast to qint16_t - return utility::saturate_cast<qint16_t>(tmp); -} - -inline qint8_t sshr_qs8(qint8_t a, int shift) -{ - ARM_COMPUTE_ERROR_ON_MSG(shift == 0, "Shift should not be zero"); - const qint8_t round_val = 1 << (shift - 1); - return sqadd_qs8(a, round_val) >> shift; -} - -inline qint16_t sshr_qs16(qint16_t a, int shift) -{ - ARM_COMPUTE_ERROR_ON_MSG(shift == 0, "Shift should not be zero"); - const qint16_t round_val = 1 << (shift - 1); - return sqadd_qs16(a, round_val) >> shift; -} - -inline qint8_t sabs_qs8(qint8_t a) -{ - return (a < 0) ? (a == std::numeric_limits<qint8_t>::min()) ? std::numeric_limits<qint8_t>::max() : -a : a; -} - -inline qint16_t sabs_qs16(qint16_t a) -{ - return (a < 0) ? (a == std::numeric_limits<qint16_t>::min()) ? std::numeric_limits<qint16_t>::max() : -a : a; -} - -inline qint8_t sadd_qs8(qint8_t a, qint8_t b) -{ - return a + b; -} - -inline qint16_t sadd_qs16(qint16_t a, qint16_t b) -{ - return a + b; -} - -inline qint8_t sqadd_qs8(qint8_t a, qint8_t b) -{ - // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow - qint16_t tmp = (static_cast<qint16_t>(a) + static_cast<qint16_t>(b)); - - // Saturate the result in case of overflow and cast to qint8_t - return utility::saturate_cast<qint8_t>(tmp); -} - -inline qint16_t sqadd_qs16(qint16_t a, qint16_t b) -{ - // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow - qint32_t tmp = (static_cast<qint32_t>(a) + static_cast<qint32_t>(b)); - - // Saturate the result in case of overflow and cast to qint16_t - return utility::saturate_cast<qint16_t>(tmp); -} - -inline qint32_t sqadd_qs32(qint32_t a, qint32_t b) -{ - // We need to store the temporary result in qint64_t otherwise we cannot evaluate the overflow - qint64_t tmp = (static_cast<qint64_t>(a) + static_cast<qint64_t>(b)); - - // Saturate the result in case of overflow and cast to qint32_t - return utility::saturate_cast<qint32_t>(tmp); -} - -inline qint8_t ssub_qs8(qint8_t a, qint8_t b) -{ - return a - b; -} - -inline qint16_t ssub_qs16(qint16_t a, qint16_t b) -{ - return a - b; -} - -inline qint8_t sqsub_qs8(qint8_t a, qint8_t b) -{ - // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow - qint16_t tmp = static_cast<qint16_t>(a) - static_cast<qint16_t>(b); - - // Saturate the result in case of overflow and cast to qint8_t - return utility::saturate_cast<qint8_t>(tmp); -} - -inline qint16_t sqsub_qs16(qint16_t a, qint16_t b) -{ - // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow - qint32_t tmp = static_cast<qint32_t>(a) - static_cast<qint32_t>(b); - - // Saturate the result in case of overflow and cast to qint16_t - return utility::saturate_cast<qint16_t>(tmp); -}
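The saturating helpers above all share one shape: widen to the next integer size so the overflow survives, do the plain operation, then clamp on the way back down. A minimal standalone sketch of that pattern, in which saturate8 stands in for utility::saturate_cast<qint8_t> (the latter is not part of this patch):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Clamp a widened intermediate back into the 8 bit range.
    static int8_t saturate8(int16_t v)
    {
        return static_cast<int8_t>(std::min<int16_t>(std::max<int16_t>(v, INT8_MIN), INT8_MAX));
    }

    // Same shape as the deleted sqadd_qs8: widen, add, saturate-narrow.
    static int8_t sqadd8(int8_t a, int8_t b)
    {
        return saturate8(static_cast<int16_t>(a) + static_cast<int16_t>(b));
    }

    int main()
    {
        std::cout << int(sqadd8(100, 100)) << '\n'; // prints 127; plain int8 wrap-around would give -56
    }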
- -inline qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position) -{ - const qint16_t round_up_const = (1 << (fixed_point_position - 1)); - - qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b); - - // Rounding up - tmp += round_up_const; - - return static_cast<qint8_t>(tmp >> fixed_point_position); -} - -inline qint16_t smul_qs16(qint16_t a, qint16_t b, int fixed_point_position) -{ - const qint32_t round_up_const = (1 << (fixed_point_position - 1)); - - qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b); - - // Rounding up - tmp += round_up_const; - - return static_cast<qint16_t>(tmp >> fixed_point_position); -} - -inline qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position) -{ - const qint16_t round_up_const = (1 << (fixed_point_position - 1)); - - qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b); - - // Rounding up - tmp += round_up_const; - - return utility::saturate_cast<qint8_t>(tmp >> fixed_point_position); -} - -inline qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position) -{ - const qint32_t round_up_const = (1 << (fixed_point_position - 1)); - - qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b); - - // Rounding up - tmp += round_up_const; - - return utility::saturate_cast<qint16_t>(tmp >> fixed_point_position); -} - -inline qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position) -{ - const qint16_t round_up_const = (1 << (fixed_point_position - 1)); - - qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b); - - // Rounding up - tmp += round_up_const; - - return tmp >> fixed_point_position; -} - -inline qint32_t sqmull_qs16(qint16_t a, qint16_t b, int fixed_point_position) -{ - const qint32_t round_up_const = (1 << (fixed_point_position - 1)); - - qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b); - - // Rounding up - tmp += round_up_const; - - return tmp >> fixed_point_position; -}
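All of these multiplies reduce to one Q-format identity: with n = fixed_point_position fractional bits, the widened product of two Qx.n values carries 2n fractional bits, so the helpers add 2^(n-1) to round to nearest and shift right by n. A worked standalone sketch, assuming an 8 bit value in Q2.5:

    #include <cstdint>
    #include <iostream>

    // Same rounding shape as the deleted smul_qs8: (a * b + 2^(n-1)) >> n.
    static int8_t mul_q8(int8_t a, int8_t b, int n)
    {
        int16_t tmp = static_cast<int16_t>(a) * static_cast<int16_t>(b); // product has 2n fractional bits
        tmp += int16_t(1) << (n - 1);                                    // round to nearest
        return static_cast<int8_t>(tmp >> n);                            // back to n fractional bits
    }

    int main()
    {
        // Q2.5: 1.5 is 48 (1.5 * 32) and 0.5 is 16; 1.5 * 0.5 = 0.75, i.e. 24.
        std::cout << int(mul_q8(48, 16, 5)) << '\n'; // prints 24
    }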
- -inline qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position) -{ - const qint8_t shift = 8 - (fixed_point_position + (__builtin_clz(a) - 24)); - - const qint8_t const_three = (3 << fixed_point_position); - qint8_t temp = shift < 0 ? (a << -shift) : (a >> shift); - qint8_t x2 = temp; - - // We need three iterations to find the result - for(int i = 0; i < 3; ++i) - { - qint8_t three_minus_dx = ssub_qs8(const_three, smul_qs8(temp, smul_qs8(x2, x2, fixed_point_position), fixed_point_position)); - x2 = (smul_qs8(x2, three_minus_dx, fixed_point_position) >> 1); - } - - temp = shift < 0 ? (x2 << (-shift >> 1)) : (x2 >> (shift >> 1)); - - return temp; -} - -inline qint16_t sinvsqrt_qs16(qint16_t a, int fixed_point_position) -{ - const qint16_t shift = 16 - (fixed_point_position + (__builtin_clz(a) - 16)); - - const qint16_t const_three = (3 << fixed_point_position); - qint16_t temp = shift < 0 ? (a << -shift) : (a >> shift); - qint16_t x2 = temp; - - // We need three iterations to find the result - for(int i = 0; i < 3; ++i) - { - qint16_t three_minus_dx = ssub_qs16(const_three, smul_qs16(temp, smul_qs16(x2, x2, fixed_point_position), fixed_point_position)); - x2 = smul_qs16(x2, three_minus_dx, fixed_point_position) >> 1; - } - - temp = shift < 0 ? (x2 << ((-shift) >> 1)) : (x2 >> (shift >> 1)); - - return temp; -} - -inline qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position) -{ - const qint16_t temp = a << fixed_point_position; - return static_cast<qint8_t>(temp / b); -} - -inline qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position) -{ - const qint32_t temp = a << fixed_point_position; - return static_cast<qint16_t>(temp / b); -}
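sdiv_qs8 and sdiv_qs16 use the complementary identity to the multiplies: an integer quotient of two Qx.n values would lose all n fractional bits, so the numerator is pre-shifted left by n in a widened type before dividing. A worked sketch under the same Q2.5 assumption:

    #include <cstdint>
    #include <iostream>

    // Same shape as the deleted sdiv_qs8: pre-scale so the quotient keeps n fractional bits.
    static int8_t div_q8(int8_t a, int8_t b, int n)
    {
        int16_t temp = static_cast<int16_t>(a) << n;
        return static_cast<int8_t>(temp / b);
    }

    int main()
    {
        // Q2.5: 1.5 / 0.5 = 3.0, i.e. 48 / 16 with pre-scaling -> 96 (3.0 * 32).
        std::cout << int(div_q8(48, 16, 5)) << '\n'; // prints 96
    }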
- -inline qint8_t sqexp_qs8(qint8_t a, int fixed_point_position) -{ - // Constants - const qint8_t const_one = (1 << fixed_point_position); - const qint8_t ln2 = ((0x58 >> (6 - fixed_point_position)) + 1) >> 1; - const qint8_t inv_ln2 = (((0x38 >> (6 - fixed_point_position)) + 1) >> 1) | const_one; - const qint8_t A = ((0x7F >> (6 - fixed_point_position)) + 1) >> 1; - const qint8_t B = ((0x3F >> (6 - fixed_point_position)) + 1) >> 1; - const qint8_t C = ((0x16 >> (6 - fixed_point_position)) + 1) >> 1; - const qint8_t D = ((0x05 >> (6 - fixed_point_position)) + 1) >> 1; - - // Polynomial expansion - const int dec_a = (sqmul_qs8(a, inv_ln2, fixed_point_position) >> fixed_point_position); - const qint8_t alpha = sabs_qs8(sqsub_qs8(a, sqmul_qs8(ln2, sqshl_qs8(dec_a, fixed_point_position), fixed_point_position))); - qint8_t sum = sqadd_qs8(sqmul_qs8(alpha, D, fixed_point_position), C); - sum = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), B); - sum = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), A); - sum = sqmul_qs8(alpha, sum, fixed_point_position); - sum = sqadd_qs8(sum, const_one); - - return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs8(sum, dec_a); -} - -inline qint16_t sqexp_qs16(qint16_t a, int fixed_point_position) -{ - // Constants - const qint16_t const_one = (1 << fixed_point_position); - const qint16_t ln2 = ((0x58B9 >> (14 - fixed_point_position)) + 1) >> 1; - const qint16_t inv_ln2 = (((0x38AA >> (14 - fixed_point_position)) + 1) >> 1) | const_one; - const qint16_t A = ((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1; - const qint16_t B = ((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1; - const qint16_t C = ((0x1693 >> (14 - fixed_point_position)) + 1) >> 1; - const qint16_t D = ((0x0592 >> (14 - fixed_point_position)) + 1) >> 1; - - // Polynomial expansion - const int dec_a = (sqmul_qs16(a, inv_ln2, fixed_point_position) >> fixed_point_position); - const qint16_t alpha = sabs_qs16(sqsub_qs16(a, sqmul_qs16(ln2, sqshl_qs16(dec_a, fixed_point_position), fixed_point_position))); - qint16_t sum = sqadd_qs16(sqmul_qs16(alpha, D, fixed_point_position), C); - sum = sqadd_qs16(sqmul_qs16(alpha, sum, fixed_point_position), B); - sum = sqadd_qs16(sqmul_qs16(alpha, sum, fixed_point_position), A); - sum = sqmul_qs16(alpha, sum, fixed_point_position); - sum = sqadd_qs16(sum, const_one); - - return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs16(sum, dec_a); -}
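sqexp_qs8 and sqexp_qs16 are classic range reductions: the input is split into an integer multiple of ln 2 (dec_a) plus a small remainder (alpha), a degree-4 polynomial in the constants A..D approximates exp on the remainder, and the leftover factor 2^dec_a becomes a final shift. A float-domain sketch of that reduction (illustrative only, not the deleted integer code):

    #include <cmath>
    #include <iostream>

    // exp(a) = 2^k * exp(r), with k playing the role of dec_a and r of alpha.
    static float exp_via_reduction(float a)
    {
        const float ln2 = std::log(2.0f);
        int   k = static_cast<int>(a / ln2); // integer part, like dec_a
        float r = a - k * ln2;               // small remainder, like alpha
        return std::ldexp(std::exp(r), k);   // exp(r) * 2^k; the 2^k becomes a shift in fixed point
    }

    int main()
    {
        std::cout << exp_via_reduction(3.7f) << " vs " << std::exp(3.7f) << '\n'; // both ~40.45
    }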
- -inline qint8_t slog_qs8(qint8_t a, int fixed_point_position) -{ - // Constants - qint8_t const_one = (1 << fixed_point_position); - qint8_t ln2 = (0x58 >> (7 - fixed_point_position)); - qint8_t A = (0x5C >> (7 - fixed_point_position - 1)); - qint8_t B = -(0x56 >> (7 - fixed_point_position)); - qint8_t C = (0x29 >> (7 - fixed_point_position)); - qint8_t D = -(0x0A >> (7 - fixed_point_position)); - - if((const_one == a) || (a < 0)) - { - return 0; - } - else if(a < const_one) - { - return -slog_qs8(sdiv_qs8(const_one, a, fixed_point_position), fixed_point_position); - } - - // Remove even powers of 2 - qint8_t shift_val = 31 - __builtin_clz(a >> fixed_point_position); - a >>= shift_val; - a = ssub_qs8(a, const_one); - - // Polynomial expansion - qint8_t sum = sqadd_qs8(sqmul_qs8(a, D, fixed_point_position), C); - sum = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), B); - sum = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), A); - sum = sqmul_qs8(a, sum, fixed_point_position); - - return smul_qs8(sadd_qs8(sum, shift_val << fixed_point_position), ln2, fixed_point_position); -} - -inline qint16_t slog_qs16(qint16_t a, int fixed_point_position) -{ - // Constants - qint16_t const_one = (1 << fixed_point_position); - qint16_t ln2 = (0x58B9 >> (7 - fixed_point_position)); - qint16_t A = (0x5C0F >> (7 - fixed_point_position - 1)); - qint16_t B = -(0x56AE >> (7 - fixed_point_position)); - qint16_t C = (0x2933 >> (7 - fixed_point_position)); - qint16_t D = -(0x0AA7 >> (7 - fixed_point_position)); - - if((const_one == a) || (a < 0)) - { - return 0; - } - else if(a < const_one) - { - return -slog_qs16(sdiv_qs16(const_one, a, fixed_point_position), fixed_point_position); - } - - // Remove even powers of 2 - qint16_t shift_val = 31 - __builtin_clz(a >> fixed_point_position); - a >>= shift_val; - a = ssub_qs16(a, const_one); - - // Polynomial expansion - qint16_t sum = sqadd_qs16(sqmul_qs16(a, D, fixed_point_position), C); - sum = sqadd_qs16(sqmul_qs16(a, sum, fixed_point_position), B); - sum = sqadd_qs16(sqmul_qs16(a, sum, fixed_point_position), A); - sum = sqmul_qs16(a, sum, fixed_point_position); - - return smul_qs16(sadd_qs16(sum, shift_val << fixed_point_position), ln2, fixed_point_position); -} - -inline float scvt_f32_qs8(qint8_t a, int fixed_point_position) -{ - return static_cast<float>(a) / (1 << fixed_point_position); -} - -inline qint8_t sqcvt_qs8_f32(float a, int fixed_point_position) -{ - // round_nearest_integer(a * 2^(fixed_point_position)) - return utility::saturate_cast<qint8_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5)); -} - -inline float scvt_f32_qs16(qint16_t a, int fixed_point_position) -{ - return static_cast<float>(a) / (1 << fixed_point_position); -} - -inline qint16_t sqcvt_qs16_f32(float a, int fixed_point_position) -{ - // round_nearest_integer(a * 2^(fixed_point_position)) - return utility::saturate_cast<qint16_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5)); -} - -inline qint8_t sqmovn_qs16(qint16_t a) -{ - // Saturate the result in case of overflow and cast to qint8_t - return utility::saturate_cast<qint8_t>(a); -} - -inline qint16_t sqmovn_qs32(qint32_t a) -{ - // Saturate the result in case of overflow and cast to qint16_t - return utility::saturate_cast<qint16_t>(a); -} -} diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h index bf971a2729..fcbc3495c3 100644 --- a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h @@ -66,7 +66,7 @@ public: * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index a3cbfb94e3..374e36442b 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -582,21 +582,19 @@ inline void permute(TensorShape &shape, const PermutationVector &perm) } } -/** Auto initialize the tensor info (shape, number of channels, data type and fixed point position) if the current assignment is empty. +/** Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty. * - * @param[in,out] info Tensor info used to check and assign. - * @param[in] shape New shape. - * @param[in] num_channels New number of channels. - * @param[in] data_type New data type - * @param[in] fixed_point_position New fixed point position - * @param[in] quantization_info (Optional) New quantization info + * @param[in,out] info Tensor info used to check and assign. + * @param[in] shape New shape. + * @param[in] num_channels New number of channels. + * @param[in] data_type New data type + * @param[in] quantization_info (Optional) New quantization info * * @return True if the tensor info has been initialized */ bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, - int fixed_point_position, QuantizationInfo quantization_info = QuantizationInfo()); /** Auto initialize the tensor info using another tensor info. @@ -647,16 +645,6 @@ bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type); */ bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout);
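For callers, the visible effect of this Helpers.h change is one argument fewer: the fixed point position disappears from auto-initialisation and the quantization info is the only extra state left. A minimal sketch of a post-patch call site (configure_like is illustrative, not part of this patch):

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/ITensorInfo.h"

    using namespace arm_compute;

    // Leave the output info empty at construction, then auto-initialise it
    // from the input; no fixed_point_position is threaded through any more.
    void configure_like(const ITensorInfo &input, ITensorInfo &output)
    {
        auto_init_if_empty(output, input.tensor_shape(), input.num_channels(),
                           input.data_type(), input.quantization_info());
    }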
-/** Set the fixed point position to the specified value if - * the current fixed point position is 0 and the data type is QS8 or QS16 - * - * @param[in,out] info Tensor info used to check and assign. - * @param[in] fixed_point_position New fixed point position - * - * @return True if the fixed point position has been changed. - */ -bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position); - /** Set the quantization info to the specified value if * the current quantization info is empty and the data type is of asymmetric quantized type * diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl index b359811328..c0e4ab8d7d 100644 --- a/arm_compute/core/Helpers.inl +++ b/arm_compute/core/Helpers.inl @@ -202,7 +202,6 @@ inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, - int fixed_point_position, QuantizationInfo quantization_info) { if(info.tensor_shape().total_size() == 0) @@ -210,7 +209,6 @@ inline bool auto_init_if_empty(ITensorInfo &info, info.set_data_type(data_type); info.set_num_channels(num_channels); info.set_tensor_shape(shape); - info.set_fixed_point_position(fixed_point_position); info.set_quantization_info(quantization_info); return true; } @@ -225,7 +223,6 @@ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_s info_sink.set_data_type(info_source.data_type()); info_sink.set_num_channels(info_source.num_channels()); info_sink.set_tensor_shape(info_source.tensor_shape()); - info_sink.set_fixed_point_position(info_source.fixed_point_position()); info_sink.set_quantization_info(info_source.quantization_info()); info_sink.set_data_layout(info_source.data_layout()); return true; @@ -278,17 +275,6 @@ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout return false; } -inline bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position) -{ - if(info.fixed_point_position() == 0 && (info.data_type() == DataType::QS8 || info.data_type() == DataType::QS16)) - { - info.set_fixed_point_position(fixed_point_position); - return true; - } - - return false; -} - inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info) { if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h index ce0cf53fdf..f113445fb7 100644 --- a/arm_compute/core/ITensorInfo.h +++ b/arm_compute/core/ITensorInfo.h @@ -81,15 +81,6 @@ public: * @return Reference to this ITensorInfo object */ virtual ITensorInfo &set_tensor_shape(const TensorShape &shape) = 0; - /** Set the fixed point position to the specified value - * - * @warning The fixed point position must be set once the data type has been configured - * - * @param[in] fixed_point_position The new fixed point position - * - * @return Reference to this ITensorInfo object - */ - virtual ITensorInfo &set_fixed_point_position(int fixed_point_position) = 0; /** Set the quantization settings (scale and offset) of the tensor. * * @param[in] quantization_info QuantizationInfo containing the scale and offset * @@ -158,11 +149,7 @@ public: * @return Offset in bytes from the beginning of the memory allocation to access the element (x, y, z, ...) */ virtual size_t offset_element_in_bytes(const Coordinates &pos) const = 0; - /** Fixed point position used when the tensor data type is QS8 or QS16 - * - * @return The fixed point position that expresses the number of bits for the fractional part of the number - */ - virtual int fixed_point_position() const = 0; + /** Element size in bytes calculated as data_size() * num_channels() * * @return The size of one element in bytes diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index 504ec6c444..ce64a8e58b 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -24,1194 +24,10 @@ #ifndef __ARM_COMPUTE_NEFIXEDPOINT_H__ #define __ARM_COMPUTE_NEFIXEDPOINT_H__ -#include "arm_compute/core/FixedPoint.h" - -#include <arm_neon.h> namespace arm_compute { -using qint8x8_t = int8x8_t; /**< 8 bit fixed point vector with 8 elements */ -using qint8x8x2_t = int8x8x2_t; /**< 8 bit fixed point vector with 16 elements */ -using qint8x8x3_t = int8x8x3_t; /**< 8 bit fixed point vector with 24 elements */ -using qint8x8x4_t = int8x8x4_t; /**< 8 bit fixed point vector with 32 elements */ -using qint8x16_t = int8x16_t; /**< 8 bit fixed point vector with 16 elements */ -using qint8x16x2_t = int8x16x2_t; /**< 8 bit fixed point vector with 32 elements */ -using qint8x16x3_t = int8x16x3_t; /**< 8 bit fixed point vector with 48 elements */ -using qint8x16x4_t = int8x16x4_t; /**< 8 bit fixed point vector with 64 elements */ -using qint16x4_t = int16x4_t; /**< 16 bit fixed point vector with 4 elements */ -using qint16x4x2_t = int16x4x2_t; /**< 16 bit fixed point vector with 8 elements */ -using qint16x4x3_t = int16x4x3_t; /**< 16 bit fixed point vector with 12 elements */ -using qint16x4x4_t = int16x4x4_t; /**< 16 bit fixed point vector with 16 elements */ -using qint16x8_t = int16x8_t; /**< 16 bit fixed point vector with 8 elements */ -using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */ -using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */ -using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */ -using qint32x2_t = int32x2_t; /**< 32 bit fixed point vector with 2 elements */ -using qint32x4_t = int32x4_t; /**< 32 bit fixed point vector with 4 elements */ -using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */ - -/** Get the lower half of a 16 elements vector - * - * @param[in] a vector of 16 elements - * - * @return 8 bit fixed point vector (8 elements) - */ -qint8x8_t vget_low_qs8(qint8x16_t a); - -/** Get the lower half of a 16 elements vector - * - * @param[in] a vector of 8 elements - * - * @return 16 bit fixed point vector (4 elements) - */ -qint16x4_t vget_low_qs16(qint16x8_t a); - -/** Get the higher half of a 16 elements vector - * - * @param[in] a vector of 16 elements - * - * @return 8 bit fixed point vector (8 elements) - */ -qint8x8_t vget_high_qs8(qint8x16_t a); - -/** Get the higher half of a 16 elements vector - * - * @param[in] a vector of 8 elements - * - * @return 16 bit fixed point vector (4 elements) - */ -qint16x4_t vget_high_qs16(qint16x8_t a); - -/** Load a single 8 bit fixed point vector from memory (8 elements) - * - * @param[in] addr Memory address of the 8 bit fixed point vector to load - * - * @return 8 bit fixed point vector (8 elements) - */ -qint8x8_t vld1_qs8(const qint8_t *addr); - -/** Load a single 16 bit fixed point vector from memory (4 elements) - * - * @param[in] addr Memory address of the 16 bit
fixed point vector to load - * - * @return 16 bit fixed point vector (4 elements) - */ -qint16x4_t vld1_qs16(const qint16_t *addr); - -/** Load a single 8 bit fixed point vector from memory (16 elements) - * - * @param[in] addr Memory address of the 8 bit fixed point vector to load - * - * @return 8 bit fixed point vector (16 elements) - */ -qint8x16_t vld1q_qs8(const qint8_t *addr); - -/** Load a single 16 bit fixed point vector from memory (8 elements) - * - * @param[in] addr Memory address of the 16 bit fixed point vector to load - * - * @return 16 bit fixed point vector (8 elements) - */ -qint16x8_t vld1q_qs16(const qint16_t *addr); - -/** Load all lanes of 8 bit fixed point vector with same value from memory (8 elements) - * - * @param[in] addr Memory address of the 8 bit fixed point scalar value to load - * - * @return 8 bit fixed point vector (8 elements) - */ -qint8x8_t vld1_dup_qs8(const qint8_t *addr); - -/** Load all lanes of 16 bit fixed point vector with same value from memory (4 elements) - * - * @param[in] addr Memory address of the 16 bit fixed point scalar value to load - * - * @return 16 bit fixed point vector (4 elements) - */ -qint16x4_t vld1_dup_qs16(const qint16_t *addr); - -/** Load all lanes of 8 bit fixed point vector with same value from memory (16 elements) - * - * @param[in] addr Memory address of the 8 bit fixed point scalar value to load - * - * @return 8 bit fixed point vector (16 elements) - */ -qint8x16_t vld1q_dup_qs8(const qint8_t *addr); - -/** Load all lanes of 16 bit fixed point vector with same value from memory (8 elements) - * - * @param[in] addr Memory address of the 16 bit fixed point scalar value to load - * - * @return 16 bit fixed point vector (8 elements) - */ -qint16x8_t vld1q_dup_qs16(const qint16_t *addr); - -/** Load two 16 bit fixed point vectors from memory (8x2 elements) - * - * @param[in] addr Memory address of the 16 bit fixed point vectors to load - * - * @return 16 bit fixed point vectors (8x2 elements) - */ -qint16x8x2_t vld2q_qs16(qint16_t *addr); - -/** Store a single 8 bit fixed point vector to memory (8 elements) - * - * @param[in] addr Memory address where the 8 bit fixed point vector should be stored - * @param[in] b 8 bit fixed point vector to store - * - */ -void vst1_qs8(qint8_t *addr, qint8x8_t b); - -/** Store a single 16 bit fixed point vector to memory (4 elements) - * - * @param[in] addr Memory address where the 16 bit fixed point vector should be stored - * @param[in] b 16 bit fixed point vector to store - * - */ -void vst1_qs16(qint16_t *addr, qint16x4_t b); - -/** Store a single 8 bit fixed point vector to memory (16 elements) - * - * @param[in] addr Memory address where the 8 bit fixed point vector should be stored - * @param[in] b 8 bit fixed point vector to store - * - */ -void vst1q_qs8(qint8_t *addr, qint8x16_t b); - -/** Store a single 16 bit fixed point vector to memory (8 elements) - * - * @param[in] addr Memory address where the 16 bit fixed point vector should be stored - * @param[in] b 16 bit fixed point vector to store - * - */ -void vst1q_qs16(qint16_t *addr, qint16x8_t b); - -/** Store two 16 bit fixed point vector to memory (8x2 elements) - * - * @param[in] addr Memory address where the 16 bit fixed point vectors should be stored - * @param[in] b 16 bit fixed point vectors to store - * - */ -void vst2q_qs16(qint16_t *addr, qint16x8x2_t b); - -/** 16 bit fixed point vector saturating narrow (8 elements) - * - * @param[in] a 16 bit fixed point vector to convert - * - * @return 8 bit fixed point vector 
- */ -qint8x8_t vqmovn_q16(qint16x8_t a); - -/** 32 bit fixed point vector saturating narrow (4 elements) - * - * @param[in] a 32 bit fixed point vector to convert - * - * @return 16 bit fixed point vector - */ -qint16x4_t vqmovn_q32(qint32x4_t a); - -/** 8 bit fixed point vector duplicate (8 elements) - * - * @param[in] a 8 bit fixed point to duplicate - * - * @return The result of the vector duplication - */ -qint8x8_t vdup_n_qs8(qint8_t a); - -/** 16 bit fixed point vector duplicate (4 elements) - * - * @param[in] a 16 bit fixed point to duplicate - * - * @return The result of the vector duplication - */ -qint16x4_t vdup_n_qs16(qint16_t a); - -/** 8 bit fixed point vector duplicate (16 elements) - * - * @param[in] a 8 bit fixed point to duplicate - * - * @return The result of the vector duplication - */ -qint8x16_t vdupq_n_qs8(qint8_t a); - -/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements) - * - * @param[in] a floating point value to convert and duplicate - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the vector duplication - */ -qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position); - -/** Duplicate a float and convert it to 16 bit fixed point vector (8 elements) - * - * @param[in] a floating point value to convert and duplicate - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the vector duplication - */ -qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position); - -/** 16 bit fixed point vector duplicate (8 elements) - * - * @param[in] a 16 bit fixed point to duplicate - * - * @return The result of the vector duplication - */ -qint16x8_t vdupq_n_qs16(qint16x8_t a); - -/** Absolute value of 8 bit fixed point vector (8 elements) - * - * @param[in] a 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector absolute value - */ -qint8x8_t vabs_qs8(qint8x8_t a); - -/** Absolute value of 16 bit fixed point vector (4 elements) - * - * @param[in] a 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector absolute value - */ -qint16x4_t vabs_qs16(qint16x4_t a); - -/** Absolute value of 8 bit fixed point vector (16 elements) - * - * @param[in] a 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector absolute value - */ -qint8x16_t vabsq_qs8(qint8x16_t a); - -/** Absolute value of 16 bit fixed point vector (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector absolute value - */ -qint16x8_t vabsq_qs16(qint16x8_t a); - -/** Saturating absolute value of 8 bit fixed point vector (8 elements) - * - * @param[in] a 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector absolute value - */ -qint8x8_t vqabs_qs8(qint8x8_t a); - -/** Saturating absolute value of 16 bit fixed point vector (4 elements) - * - * @param[in] a 4 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector absolute value - */ -qint16x4_t vqabs_qs16(qint16x4_t a); - -/** Saturating absolute value of 8 bit fixed point vector (16 elements) - * - * @param[in] a 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector absolute value - */ -qint8x16_t vqabsq_qs8(qint8x16_t a); - -/** Saturating absolute value 
of 16 bit fixed point vector (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector absolute value - */ -qint16x8_t vqabsq_qs16(qint16x8_t a); - -/** 8 bit fixed point vector max (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector max operation - */ -qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b); - -/** 16 bit fixed point vector max (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector max operation - */ -qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b); - -/** 8 bit fixed point vector max (16 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector max operation - */ -qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b); - -/** 16 bit fixed point vector max (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector max operation - */ -qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b); - -/** 8 bit fixed point vector pairwise max (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector pairwise max operation - */ -qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b); - -/** 16 bit fixed point vector pairwise max (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector pairwise max operation - */ -qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b); - -/** 8 bit fixed point vector min (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector max operation - */ -qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b); - -/** 16 bit fixed point vector min (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector max operation - */ -qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b); - -/** 8 bit fixed point vector min (16 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector min operation - */ -qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b); - -/** 16 bit fixed point vector min (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector min operation - */ -qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b); - -/** 8 bit fixed point vector pairwise min (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector pairwise min operation - */ -qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b); - -/** 16 bit fixed point vector pairwise min (4 elements) - * - * 
@param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector pairwise min operation - */ -qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b); - -/** 8 bit fixed point vector add (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector addition - */ -qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b); - -/** 16 bit fixed point vector add (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector addition - */ -qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b); - -/** 8 bit fixed point vector add (16 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector addition - */ -qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b); - -/** 16 bit fixed point vector add (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector addition - */ -qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b); - -/** 8 bit fixed point vector saturating add (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow - */ -qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b); - -/** 16 bit fixed point vector saturating add (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow - */ -qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b); - -/** 8 bit fixed point vector saturating add (16 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow - */ -qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b); - -/** 16 bit fixed point vector saturating add (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow - */ -qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b); - -/** 8 bit fixed point vector saturating pairwise add (8 elements) - * - * @param[in] a 8 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector addition. 
The result is saturated in case of overflow - */ -int16x4_t vpaddl_qs8(qint8x8_t a); - -/** 8 bit fixed point vector subtraction (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector subtraction - */ -qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b); - -/** 16 bit fixed point vector subtraction (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector subtraction - */ -qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b); - -/** 8 bit fixed point vector subtraction (16 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector subtraction - */ -qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b); - -/** 16 bit fixed point vector subtraction (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector subtraction - */ -qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b); - -/** 8 bit fixed point vector saturating subtraction (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow - */ -qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b); - -/** 16 bit fixed point vector saturating subtraction (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow - */ -qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b); - -/** 8 bit fixed point vector saturating subtraction (16 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * - * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow - */ -qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b); - -/** 16 bit fixed point vector saturating subtraction (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * - * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow - */ -qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b); - -/** 8 bit fixed point vector multiply (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiplication. - */ -qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); - -/** 16 bit fixed point vector multiply (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiplication. 
- */ -qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); - -/** 8 bit fixed point vector multiply (16 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiplication. - */ -qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); - -/** 16 bit fixed point vector multiply (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiplication. - */ -qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); - -/** 8 bit fixed point vector saturating multiply (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow - */ -qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); - -/** 16 bit fixed point vector saturating multiply (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow - */ -qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); - -/** 8 bit fixed point vector saturating multiply (16 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow - */ -qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); - -/** 16 bit fixed point vector saturating multiply (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow - */ -qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); - -/** 8 bit fixed point vector long multiply (8 elements) - * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point long vector multiplication. 
- */ -qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); - -/** 16 bit fixed point vector long multiply (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 32 bit fixed point long vector multiplication. - */ -qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); - -/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). - * - * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 8 bit fixed point input vector - * @param[in] c Third 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiply-accumulate - */ -qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); - -/** 16 bit fixed point vector multiply-accumulate (4 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). - * - * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 16 bit fixed point input vector - * @param[in] c Third 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiply-accumulate - */ -qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position); - -/** 8 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). - * - * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 8 bit fixed point input vector - * @param[in] c Third 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiply-accumulate - */ -qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position); - -/** 16 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). - * - * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 16 bit fixed point input vector - * @param[in] c Third 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiply-accumulate - */ -qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position); - -/** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). 
- * - * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 8 bit fixed point input vector - * @param[in] c Third 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow - */ -qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); - -/** 16 bit fixed point vector saturating multiply-accumulate (4 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). - * - * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 16 bit fixed point input vector - * @param[in] c Third 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow - */ -qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position); - -/** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). - * - * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 8 bit fixed point input vector - * @param[in] c Third 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiply-accumulate.The result is saturated in case of overflow - */ -qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position); - -/** 16 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). - * - * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 16 bit fixed point input vector - * @param[in] c Third 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiply-accumulate.The result is saturated in case of overflow - */ -qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position); - -/** 8 bit fixed point vector multiply-accumulate long (8 elements). - * This operation performs the product between @p b and @p c and add the result to the 16 bit fixed point vector @p a (a + b * c). 
-/** 8 bit fixed point vector multiply-accumulate long (8 elements). - * This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c). - * - * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 8 bit fixed point input vector - * @param[in] c Third 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiply-accumulate long - */ -qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); - -/** 16 bit fixed point vector multiply-accumulate long (4 elements). - * This operation performs the product between @p b and @p c and adds the result to the 32 bit fixed point vector @p a (a + b * c). - * - * @param[in] a First 32 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 16 bit fixed point input vector - * @param[in] c Third 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiply-accumulate long - */ -qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position); - -/** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector. - * This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c). - * - * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 8 bit fixed point input vector - * @param[in] c Third 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8 bit fixed point vector multiply-accumulate long - */ -qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); - -/** 16 bit fixed point vector saturating multiply-accumulate long (4 elements). The saturation is performed on the 32 bit fixed point output vector. - * This operation performs the product between @p b and @p c and adds the result to the 32 bit fixed point vector @p a (a + b * c). - * - * @param[in] a First 32 bit fixed point input vector where the result of multiplication must be added to - * @param[in] b Second 16 bit fixed point input vector - * @param[in] c Third 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit fixed point vector multiply-accumulate long - */ -qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position); -
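The long (widening) variants keep the accumulator at the doubled width, so only the product is rescaled and no final narrowing is needed. Scalar sketch (hypothetical helper):

    #include <cstdint>

    // Scalar model of one lane of vmlal_qs8: 8 bit operands, 16 bit
    // accumulator; the 8 x 8 product always fits in 16 bits.
    int16_t scalar_mlal_qs8(int16_t a, int8_t b, int8_t c, int fpp)
    {
        int16_t prod = int16_t(int16_t(b) * int16_t(c));  // widen
        prod = int16_t(prod + (int16_t(1) << (fpp - 1))); // round
        return int16_t(a + (prod >> fpp));                // accumulate wide
    }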
-/** Convert a float vector with 4x2 elements to 8 bit fixed point vector with 8 elements - * - * @param[in] a Float input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow - */ -qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position); - -/** Convert a float vector with 4 elements to 16 bit fixed point vector with 4 elements - * - * @param[in] a Float input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow - */ -qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position); - -/** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements - * - * @param[in] a Float input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow - */ -qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position); - -/** Convert a float vector with 4x2 elements to 16 bit fixed point vector with 8 elements - * - * @param[in] a Float input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow - */ -qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position); - -/** Convert an 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements - * - * @param[in] a 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion 8 bit fixed point -> float32x4x2 - */ -float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position); - -/** Convert a 16 bit fixed point vector with 4 elements to a float vector with 4 elements - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion 16 bit fixed point -> float32x4 - */ -float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position); - -/** Convert an 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements - * - * @param[in] a 8 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion 8 bit fixed point -> float32x4x4 - */ -float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position); - -/** Convert a 16 bit fixed point vector with 8 elements to a float vector with 4x2 elements - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the conversion 16 bit fixed point -> float32x4x2 - */ -float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position); -
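Both conversion directions reduce to a scale by 2^fixed_point_position; the quantising direction additionally rounds half away from zero and saturates. A scalar sketch of the pair (hypothetical names; inputs assumed small enough that the intermediate fits in an int):

    #include <algorithm>
    #include <cstdint>

    // float -> Qx.fpp with round-half-away-from-zero and saturation.
    int8_t scalar_qcvt_qs8_f32(float a, int fpp)
    {
        float scaled = a * float(1 << fpp) + (a >= 0.f ? 0.5f : -0.5f);
        return int8_t(std::clamp<int>(int(scaled), -128, 127));
    }

    // Qx.fpp -> float is exact: divide by the scale factor.
    float scalar_cvt_f32_qs8(int8_t a, int fpp)
    {
        return float(a) / float(1 << fpp);
    }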
-/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit reciprocal (1/a). - */ -qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position); - -/** Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method. (4 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit reciprocal (1/a). - */ -qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position); - -/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit reciprocal (1/a). - */ -qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position); - -/** Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method. (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit reciprocal (1/a). - */ -qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position); - -/** Division fixed point 8bit (8 elements) - * - * @param[in] a First 8bit fixed point input vector - * @param[in] b Second 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The quotient in fixed point format. - */ -qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); - -/** Division fixed point 16 bit (4 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The quotient in fixed point format. - */ -qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); - -/** Division fixed point 8bit (16 elements) - * - * @param[in] a First 8bit fixed point input vector - * @param[in] b Second 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The quotient in 8bit fixed point format. - */ -qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); - -/** Division fixed point 16 bit (8 elements) - * - * @param[in] a First 16 bit fixed point input vector - * @param[in] b Second 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The quotient in 16 bit fixed point format. - */ -qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); -
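As the .inl hunk below shows, division is simply multiplication by the Newton-Raphson reciprocal: vdiv_qs8(a, b, fpp) computes vmul_qs8(a, vrecip_qs8(b, fpp), fpp). A worked integer example of that composition in Q2.5 (values illustrative only):

    #include <cstdint>
    #include <cstdio>

    // 1.5 / 0.5 in Q2.5: reciprocal first, then a Q-format multiply.
    int main()
    {
        const int fpp = 5;
        int32_t a = int32_t(1.5f * (1 << fpp));           // 48
        int32_t b = int32_t(0.5f * (1 << fpp));           // 16
        int32_t recip_b = ((1 << (2 * fpp)) + b / 2) / b; // round(2^10 / 16) = 64
        int32_t q = (a * recip_b) >> fpp;                 // Q-format multiply
        std::printf("%f\n", q / float(1 << fpp));         // prints 3.000000
    }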
-/** Perform a 4th degree polynomial approximation. (8 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit Taylor approximation. - */ -template <bool islog> -qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position); - -/** Perform a 4th degree polynomial approximation. (4 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit Taylor approximation. - */ -template <bool islog> -qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position); - -/** Perform a 4th degree polynomial approximation. (16 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit Taylor approximation. - */ -template <bool islog> -qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position); - -/** Perform a 4th degree polynomial approximation. (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit Taylor approximation. - */ -template <bool islog> -qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position); - -/** Calculate saturating exponential fixed point 8bit (8 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit saturating exponential - */ -qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position); - -/** Calculate saturating exponential fixed point 16 bit (4 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit saturating exponential - */ -qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position); - -/** Calculate saturating exponential fixed point 8bit (16 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit saturating exponential - */ -qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position); - -/** Calculate saturating exponential fixed point 16 bit (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit saturating exponential - */ -qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position); -
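The 4th degree approximation evaluates a * (A + a * (B + a * (C + a * D))) by Horner's scheme, with every product being a rounded Q-format multiply. A scalar sketch with a hypothetical qmul helper:

    #include <cstdint>

    // Rounded Qx.fpp multiply used by the Horner steps below.
    static int32_t qmul(int32_t x, int32_t y, int fpp)
    {
        return (x * y + (1 << (fpp - 1))) >> fpp;
    }

    // Matches the x1/x2/x3/res chain in the vtaylor_poly_* bodies further
    // down in this hunk: result = A*a + B*a^2 + C*a^3 + D*a^4 in Qx.fpp.
    int32_t scalar_taylor_poly(int32_t a, int32_t A, int32_t B,
                               int32_t C, int32_t D, int fpp)
    {
        int32_t x1 = qmul(a, D, fpp) + C;
        int32_t x2 = qmul(a, x1, fpp) + B;
        int32_t x3 = qmul(a, x2, fpp) + A;
        return qmul(a, x3, fpp);
    }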
-/** Calculate logarithm fixed point 8 bit (8 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit logarithm. - */ -qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position); - -/** Calculate logarithm fixed point 16 bit (4 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit logarithm. - */ -qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position); - -/** Calculate logarithm fixed point 8 bit (16 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit logarithm. - */ -qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position); - -/** Calculate logarithm fixed point 16 bit (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit logarithm. - */ -qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position); - -/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (8 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit inverse sqrt. - */ -qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position); - -/** Calculate inverse square root for fixed point 16 bit using Newton-Raphson method (4 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit inverse sqrt. - */ -qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position); - -/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (8 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit inverse sqrt. - */ -qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position); - -/** Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphson method (4 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit inverse sqrt. - */ -qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position); - -/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (16 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit inverse sqrt. - */ -qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position); -
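The inverse square root declarations above refine a seed with the classic Newton-Raphson step x' = x * (3 - a * x * x) / 2; the library runs the same loop in Q arithmetic. A float sketch of just that iteration (the seed choice is illustrative and assumes a >= 1):

    // Each iteration roughly doubles the number of correct bits.
    float scalar_invsqrt(float a, int iterations)
    {
        float x = 1.0f / a;                    // crude seed for a >= 1
        for (int i = 0; i < iterations; ++i)
            x = x * (3.0f - a * x * x) * 0.5f;
        return x;
    }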
-/** Calculate inverse square root for fixed point 16 bit using Newton-Raphson method (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit inverse sqrt. - */ -qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position); - -/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (16 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit inverse sqrt. - */ -qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position); - -/** Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphson method (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16 bit inverse sqrt. - */ -qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position); - -/** Calculate hyperbolic tangent for fixed point 8bit (8 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The calculated Hyperbolic Tangent. - */ -qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position); - -/** Calculate hyperbolic tangent for fixed point 16 bit (4 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The calculated Hyperbolic Tangent. - */ -qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position); - -/** Calculate hyperbolic tangent for fixed point 8bit (16 elements) - * - * @param[in] a 8bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The calculated Hyperbolic Tangent. - */ -qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position); - -/** Calculate hyperbolic tangent for fixed point 16 bit (8 elements) - * - * @param[in] a 16 bit fixed point input vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The calculated Hyperbolic Tangent. - */ -qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position); - -/** Calculate saturating power for fixed point 8bit (16 elements). - * - * pow(a,b) = e^(b*log(a)) - * - * @param[in] a 8bit fixed point input vector - * @param[in] b 8bit fixed point power vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 8bit power. - */ -qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); -
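The power function is documented above as pow(a, b) = e^(b*log(a)), i.e. it chains the saturating exp and log approximations declared earlier. A quick float check of the identity:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        float a = 1.5f, b = 2.0f;
        // Both lines print 2.250000.
        std::printf("%f\n", std::pow(a, b));
        std::printf("%f\n", std::exp(b * std::log(a)));
    }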
-/** Calculate saturating power for fixed point 16bit (8 elements). - * - * pow(a,b) = e^(b*log(a)) - * - * @param[in] a 16bit fixed point input vector - * @param[in] b 16bit fixed point power vector - * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number - * - * @return The result of the 16bit power. - */ -qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); - /** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements * * @param[in] a Float input vector diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl index b86c3cbec3..14e51d825c 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -26,1965 +26,7 @@ namespace arm_compute { -/** Exponent polynomial coefficients for 8 bit fixed point (8 elements) - * Format is in Q0.7 for all elements - */ -static const std::array<qint8x8_t, 4> exp_tab_qs8 = -{ - { - vdup_n_s8(0x7F), // 0.9978546 - vdup_n_s8(0x3F), // 0.4994721 - vdup_n_s8(0x16), // 0.1763723 - vdup_n_s8(0x05), // 0.0435108 - } -}; - -/** Exponent polynomial coefficients for 16 bit fixed point (4 elements) - * Format is in Q0.15 for all elements - */ -static const std::array<qint16x4_t, 4> exp_tab_qs16 = -{ - { - vdup_n_s16(0x7FBA), // 0.9978546 - vdup_n_s16(0x3FE9), // 0.4994721 - vdup_n_s16(0x1693), // 0.1763723 - vdup_n_s16(0x0592), // 0.0435108 - } -}; - -/** Exponent polynomial coefficients for 8 bit fixed point (16 elements) - * Format is in Q0.7 for all elements - */ -static const std::array<qint8x16_t, 4> exp_tabq_qs8 = -{ - { - vdupq_n_s8(0x7F), // 0.9978546 - vdupq_n_s8(0x3F), // 0.4994721 - vdupq_n_s8(0x16), // 0.1763723 - vdupq_n_s8(0x05), // 0.0435108 - } -}; - -/** Exponent polynomial coefficients for 16 bit fixed point (8 elements) - * Format is in Q0.15 for all elements - */ -static const std::array<qint16x8_t, 4> exp_tabq_qs16 = -{ - { - vdupq_n_s16(0x7FBA), // 0.9978546 - vdupq_n_s16(0x3FE9), // 0.4994721 - vdupq_n_s16(0x1693), // 0.1763723 - vdupq_n_s16(0x0592), // 0.0435108 - } -}; - -/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements) - * Format is in Q0.7 for all elements except the first one which is in Q1.6 - */ -static const std::array<qint8x8_t, 4> log_tab_qs8 = -{ - { - vdup_n_s8(0x5C), // 1.4384189 - vdup_n_s8(-0x56), // -0.6771900 - vdup_n_s8(0x29), // 0.3218538 - vdup_n_s8(-0x0A), // -0.0832229 - } -}; - -/** Logarithm polynomial coefficients for 16 bit fixed point (4 elements) - * Format is in Q0.15 for all elements except the first one which is in Q1.14 - */ -static const std::array<qint16x4_t, 4> log_tab_qs16 = -{ - { - vdup_n_s16(0x5C0F), // 1.4384189 - vdup_n_s16(-0x56AE), // -0.6771900 - vdup_n_s16(0x2933), // 0.3218538 - vdup_n_s16(-0x0AA7), // -0.0832229 - } -}; - -/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements) - * Format is in Q0.7 for all elements except the first one which is in Q1.6 - */ -static const std::array<qint8x16_t, 4> log_tabq_qs8 = -{ - { - vdupq_n_s8(0x5C), // 1.4384189 - vdupq_n_s8(-0x56), // -0.6771900 - vdupq_n_s8(0x29), // 0.3218538 - vdupq_n_s8(-0x0A), // -0.0832229 - } -}; - -/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements) - * Format is in Q0.15 for all elements except the first one which is in Q1.14 - */ -static const std::array<qint16x8_t, 4> log_tabq_qs16 = -{ - { - vdupq_n_s16(0x5C0F), // 1.4384189 - vdupq_n_s16(-0x56AE), // -0.6771900 - vdupq_n_s16(0x2933), // 0.3218538 - vdupq_n_s16(-0x0AA7), // -0.0832229 - } -}; - #ifndef DOXYGEN_SKIP_THIS -inline qint8x8_t vget_low_qs8(qint8x16_t a) -{ - return vget_low_s8(a); -} - -inline qint16x4_t vget_low_qs16(qint16x8_t a) -{ - return vget_low_s16(a); -} - -inline qint8x8_t vget_high_qs8(qint8x16_t a) -{ - return vget_high_s8(a); -} - -inline qint16x4_t vget_high_qs16(qint16x8_t a) -{ - return vget_high_s16(a); -} - -inline qint8x8_t vld1_qs8(const qint8_t *addr) -{ - return
vld1_s8(addr); -} - -inline qint16x4_t vld1_qs16(const qint16_t *addr) -{ - return vld1_s16(addr); -} - -inline qint8x16_t vld1q_qs8(const qint8_t *addr) -{ - return vld1q_s8(addr); -} - -inline qint16x8_t vld1q_qs16(const qint16_t *addr) -{ - return vld1q_s16(addr); -} - -inline qint8x8_t vld1_dup_qs8(const qint8_t *addr) -{ - return vld1_dup_s8(addr); -} - -inline qint16x4_t vld1_dup_qs16(const qint16_t *addr) -{ - return vld1_dup_s16(addr); -} - -inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr) -{ - return vld1q_dup_s8(addr); -} - -inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr) -{ - return vld1q_dup_s16(addr); -} - -inline qint16x8x2_t vld2q_qs16(const qint16_t *addr) -{ - return vld2q_s16(addr); -} - -inline void vst1_qs8(qint8_t *addr, qint8x8_t b) -{ - vst1_s8(addr, b); -} - -inline void vst1_qs16(qint16_t *addr, qint16x4_t b) -{ - vst1_s16(addr, b); -} - -inline void vst1q_qs8(qint8_t *addr, qint8x16_t b) -{ - vst1q_s8(addr, b); -} - -inline void vst1q_qs16(qint16_t *addr, qint16x8_t b) -{ - vst1q_s16(addr, b); -} - -inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b) -{ - vst2q_s16(addr, b); -} - -inline qint8x8_t vqmovn_qs16(qint16x8_t a) -{ - return vqmovn_s16(a); -} - -inline qint16x4_t vqmovn_qs32(qint32x4_t a) -{ - return vqmovn_s32(a); -} - -inline qint8x8_t vdup_n_qs8(qint8_t a) -{ - return vdup_n_s8(a); -} - -inline qint16x4_t vdup_n_qs16(qint16_t a) -{ - return vdup_n_s16(a); -} - -inline qint8x16_t vdupq_n_qs8(qint8_t a) -{ - return vdupq_n_s8(a); -} - -inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position) -{ - float32x4x4_t res = - { - { - vdupq_n_f32(a), - vdupq_n_f32(a), - vdupq_n_f32(a), - vdupq_n_f32(a), - } - }; - return vqcvtq_qs8_f32(res, fixed_point_position); -} - -inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position) -{ - float32x4x2_t res = - { - { - vdupq_n_f32(a), - vdupq_n_f32(a), - } - }; - return vqcvtq_qs16_f32(res, fixed_point_position); -} - -inline qint16x8_t vdupq_n_qs16(qint16_t a) -{ - return vdupq_n_s16(a); -} - -inline qint32x4_t vdupq_n_qs32(qint32_t a) -{ - return vdupq_n_s32(a); -} - -inline qint8x8_t vabs_qs8(qint8x8_t a) -{ - return vabs_s8(a); -} - -inline qint16x4_t vabs_qs16(qint16x4_t a) -{ - return vabs_s16(a); -} - -inline qint8x16_t vabsq_qs8(qint8x16_t a) -{ - return vabsq_s8(a); -} - -inline qint16x8_t vabsq_qs16(qint16x8_t a) -{ - return vabsq_s16(a); -} - -inline qint8x8_t vqabs_qs8(qint8x8_t a) -{ - return vqabs_s8(a); -} - -inline qint16x4_t vqabs_qs16(qint16x4_t a) -{ - return vqabs_s16(a); -} - -inline qint8x16_t vqabsq_qs8(qint8x16_t a) -{ - return vqabsq_s8(a); -} - -inline qint16x8_t vqabsq_qs16(qint16x8_t a) -{ - return vqabsq_s16(a); -} - -inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b) -{ - return vmax_s8(a, b); -} - -inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b) -{ - return vmax_s16(a, b); -} - -inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b) -{ - return vmaxq_s8(a, b); -} - -inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b) -{ - return vpmax_s8(a, b); -} - -inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b) -{ - return vpmax_s16(a, b); -} - -inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b) -{ - return vmaxq_s16(a, b); -} - -inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b) -{ - return vmin_s8(a, b); -} - -inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b) -{ - return vmin_s16(a, b); -} - -inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b) -{ - return vminq_s8(a, b); -} - -inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b) 
-{ - return vpmin_s8(a, b); -} - -inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b) -{ - return vpmin_s16(a, b); -} - -inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b) -{ - return vminq_s16(a, b); -} - -inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b) -{ - return vadd_s8(a, b); -} - -inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b) -{ - return vadd_s16(a, b); -} - -inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b) -{ - return vaddq_s8(a, b); -} - -inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b) -{ - return vaddq_s16(a, b); -} - -inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b) -{ - return vqadd_s8(a, b); -} - -inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b) -{ - return vqadd_s16(a, b); -} - -inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b) -{ - return vqadd_s32(a, b); -} - -inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b) -{ - return vqaddq_s8(a, b); -} - -inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b) -{ - return vqaddq_s16(a, b); -} - -inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b) -{ - return vqaddq_s32(a, b); -} - -inline int16x4_t vpaddl_qs8(qint8x8_t a) -{ - return vpaddl_s8(a); -} - -inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b) -{ - return vsub_s8(a, b); -} - -inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b) -{ - return vsub_s16(a, b); -} - -inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b) -{ - return vsubq_s8(a, b); -} - -inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b) -{ - return vsubq_s16(a, b); -} - -inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b) -{ - return vqsub_s8(a, b); -} - -inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b) -{ - return vqsub_s16(a, b); -} - -inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b) -{ - return vqsubq_s8(a, b); -} - -inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b) -{ - return vqsubq_s16(a, b); -} - -inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - res = vmlal_s8(res, a, b); - - // Shift right by fixed_point_position - res = vshlq_s16(res, fixed_point_position_s16); - - // Convert back to qint8 - return vmovn_s16(res); -} - -inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - res = vmlal_s16(res, a, b); - - // Shift right by fixed_point_position - res = vshlq_s32(res, fixed_point_position_s32); - - // Convert back to qint16 - return vmovn_s32(res); -} - -inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary results with a constant used to round up the result - qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1)); - qint16x8_t res1 = res0; - - // Vector multiply-accumulate long - res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b)); - res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b)); - - // Shift right by fixed_point_position - res0 = vshlq_s16(res0, 
fixed_point_position_s16); - res1 = vshlq_s16(res1, fixed_point_position_s16); - - // Convert back to qint8 - return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1)); -} - -inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary results with a constant used to round up the result - qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1)); - qint32x4_t res1 = res0; - - // Vector multiply-accumulate long - res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b)); - res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b)); - - // Shift right by fixed_point_position - res0 = vshlq_s32(res0, fixed_point_position_s32); - res1 = vshlq_s32(res1, fixed_point_position_s32); - - // Convert back to qint16 - return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); -} - -inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - res = vmlal_s8(res, a, b); - - // Shift right by fixed_point_position - res = vqshlq_s16(res, fixed_point_position_s16); - - // Convert back to qint8 and saturate - return vqmovn_s16(res); -} - -inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - res = vmlal_s16(res, a, b); - - // Shift right by fixed_point_position - res = vqshlq_s32(res, fixed_point_position_s32); - - // Convert back to qint16 and saturate - return vqmovn_s32(res); -} - -inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary results with a constant used to round up the result - qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1)); - qint16x8_t res1 = res0; - - // Vector multiply-accumulate long - res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b)); - res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b)); - - // Shift right by fixed_point_position - res0 = vqshlq_s16(res0, fixed_point_position_s16); - res1 = vqshlq_s16(res1, fixed_point_position_s16); - - // Convert back to qint8 and saturate - return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1)); -} - -inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary results with a constant used to round up the result - qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1)); - qint32x4_t res1 = res0; - - // Vector multiply-accumulate long - res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b)); - res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b)); - - // Shift right by fixed_point_position - res0 = vqshlq_s32(res0, fixed_point_position_s32); - res1 = vqshlq_s32(res1, fixed_point_position_s32); - - // Convert back to qint16 and saturate - return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1)); -} - -inline qint16x8_t 
vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - qint16x8_t res = vmull_s8(a, b); - - return vqrshlq_s16(res, fixed_point_position_s16); -} - -inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s16(tmp, a, b); - - // Shift right by fixed_point_position - return vqshlq_s32(tmp, fixed_point_position_s32); -} - -inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s8(tmp, b, c); - - // Shift right by fixed_point_position - tmp = vshlq_s16(tmp, fixed_point_position_s16); - - // Convert back to qint8 and accumulate - return vadd_s8(a, vmovn_s16(tmp)); -} - -inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s16(tmp, b, c); - - // Shift right by fixed_point_position - tmp = vshlq_s32(tmp, fixed_point_position_s32); - - // Convert back to qint16 and accumulate - return vadd_s16(a, vmovn_s32(tmp)); -} - -inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary results with a constant used to round up the result - qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1)); - qint16x8_t tmp1 = tmp0; - - // Vector multiply-accumulate long - tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c)); - tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c)); - - // Shift right by fixed_point_position - tmp0 = vshlq_s16(tmp0, fixed_point_position_s16); - tmp1 = vshlq_s16(tmp1, fixed_point_position_s16); - - // Convert back to qint8 and accumulate - return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1))); -} - -inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary results with a constant used to round up the result - qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1)); - qint32x4_t tmp1 = tmp0; - - // Vector multiply-accumulate long - tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c)); - tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c)); - - // Shift right by fixed_point_position - tmp0 = vshlq_s32(tmp0, fixed_point_position_s32); - tmp1 = vshlq_s32(tmp1, fixed_point_position_s32); - - // Convert back to qint16 and accumulate - return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1))); -} - -inline qint8x8_t
vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s8(tmp, b, c); - - // Shift right by fixed_point_position - tmp = vqshlq_s16(tmp, fixed_point_position_s16); - - // Convert back to qint8 and accumulate - return vqadd_s8(a, vqmovn_s16(tmp)); -} - -inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s16(tmp, b, c); - - // Shift right by fixed_point_position - tmp = vqshlq_s32(tmp, fixed_point_position_s32); - - // Convert back to qint16 and accumulate - return vqadd_s16(a, vqmovn_s32(tmp)); -} - -inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary results with a constant used to round up the result - qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1)); - qint16x8_t tmp1 = tmp0; - - // Vector multiply-accumulate long - tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c)); - tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c)); - - // Shift right by fixed_point_position - tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16); - tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16); - - // Convert back to qint8 and accumulate - qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1)); - return vqaddq_s8(a, res); -} - -inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary results with a constant used to round up the result - qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1)); - qint32x4_t tmp1 = tmp0; - - // Vector multiply-accumulate long - tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c)); - tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c)); - - // Shift right by fixed_point_position - tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32); - tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32); - - // Convert back to qint16 and accumulate - qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1)); - return vqaddq_s16(a, res); -} - -inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s8(tmp, b, c); - - // Shift right by fixed_point_position - tmp = vshlq_s16(tmp, fixed_point_position_s16); - - // Accumulate - return vaddq_s16(a, tmp); -} - -inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary result with a constant used to
round up the result - qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s16(tmp, b, c); - - // Shift right by fixed_point_position - tmp = vshlq_s32(tmp, fixed_point_position_s32); - - // Accumulate - return vaddq_s32(a, tmp); -} - -inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) -{ - const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s8(tmp, b, c); - - // Shift right by fixed_point_position - tmp = vqshlq_s16(tmp, fixed_point_position_s16); - - // Accumulate - return vqaddq_s16(a, tmp); -} - -inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position) -{ - const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position); - - // Initialize the temporary result with a constant used to round up the result - qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1)); - - // Vector multiply-accumulate long - tmp = vmlal_s16(tmp, b, c); - - // Shift right by fixed_point_position - tmp = vqshlq_s32(tmp, fixed_point_position_s32); - - // Accumulate - return vqaddq_s32(a, tmp); -} - -inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position) -{ - const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); - - float32x4x2_t res_f32 = - { - { - vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), - vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)) - } - }; - - res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2); - res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2); - - const int32x4x2_t res_s32 = - { - { - vcvtq_s32_f32(res_f32.val[0]), - vcvtq_s32_f32(res_f32.val[1]), - } - }; - - const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])); - - return vqmovn_s16(res_s16); -} - -inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position) -{ - const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); - - float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)); - - res_f32 = vmlaq_f32(res_f32, a, pow2); - - const int32x4_t res_s32 = vcvtq_s32_f32(res_f32); - - return vqmovn_s32(res_s32); -} -
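The vbslq_f32/vmlaq_f32 pair in the vqcvt* bodies above implements round-half-away-from-zero: a bias of +0.5 or -0.5 is selected by sign, then the biased product is truncated by the float-to-int conversion. Scalar equivalent (hypothetical name):

    #include <cstdint>

    // Matches the select-bias-then-truncate idiom of the vqcvt helpers.
    int32_t round_to_fixed(float a, int fpp)
    {
        float bias = (a >= 0.f) ? 0.5f : -0.5f;     // vbslq_f32 on the sign
        return int32_t(bias + a * float(1 << fpp)); // vmlaq_f32 + vcvtq
    }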
-inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position) -{ - const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); - - float32x4x4_t res_f32 = - { - { - vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), - vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), - vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), - vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)) - } - }; - - res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2); - res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2); - res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2); - res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2); - - const int32x4x4_t res_s32 = - { - { - vcvtq_s32_f32(res_f32.val[0]), - vcvtq_s32_f32(res_f32.val[1]), - vcvtq_s32_f32(res_f32.val[2]), - vcvtq_s32_f32(res_f32.val[3]), - } - }; - - const int16x8x2_t res_s16 = - { - { - vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])), - vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])), - } - }; - - return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1])); -} - -inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position) -{ - const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); - - float32x4x2_t res_f32 = - { - { - vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), - vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)) - } - }; - - res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2); - res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2); - - const int32x4x2_t res_s32 = - { - { - vcvtq_s32_f32(res_f32.val[0]), - vcvtq_s32_f32(res_f32.val[1]) - } - }; - - return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])); -} - -inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position) -{ - const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position)); - - const int16x8_t res_s16 = vmovl_s8(a); - - const int32x4x2_t res_s32 = - { - { - vmovl_s16(vget_low_qs16(res_s16)), - vmovl_s16(vget_high_qs16(res_s16)) - } - }; - - float32x4x2_t res_f32 = - { - { - vcvtq_f32_s32(res_s32.val[0]), - vcvtq_f32_s32(res_s32.val[1]) - } - }; - - res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2); - res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2); - - return res_f32; -} - -inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position) -{ - const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position)); - const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a)); - - return vmulq_f32(res_f32, pow2); -} - -inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position) -{ - const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position)); - - const int16x8x2_t res_s16 = - { - { - vmovl_s8(vget_low_s8(a)), - vmovl_s8(vget_high_s8(a)), - } - }; - - const int32x4x4_t res_s32 = - { - { - vmovl_s16(vget_low_qs16(res_s16.val[0])), - vmovl_s16(vget_high_qs16(res_s16.val[0])), - vmovl_s16(vget_low_qs16(res_s16.val[1])), - vmovl_s16(vget_high_qs16(res_s16.val[1])), - } - }; - - float32x4x4_t res_f32 = - { - { - vcvtq_f32_s32(res_s32.val[0]), - vcvtq_f32_s32(res_s32.val[1]), - vcvtq_f32_s32(res_s32.val[2]), - vcvtq_f32_s32(res_s32.val[3]) - } - }; - - res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2); - res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2); - res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2); - res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2); - - return res_f32; -} - -inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position) -{ - const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position)); - - const int32x4x2_t res_s32 = - { - { - vmovl_s16(vget_low_qs16(a)), - vmovl_s16(vget_high_qs16(a)) - } - }; - - float32x4x2_t res_f32 = - { - { - vcvtq_f32_s32(res_s32.val[0]), - vcvtq_f32_s32(res_s32.val[1]) - } - }; - - res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2); - res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2); - - return res_f32; -} -
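The 0x5A/0x3C constants built by the reciprocal routines below are 48/17 and 32/17, the standard seed X0 = 48/17 - (32/17) * d for Newton-Raphson division of an input normalised into [0.5, 1); they are stored at maximum precision and shifted down to the caller's Q format. How the encodings are derived (assuming round-to-nearest):

    #include <cstdio>

    // Q2.5:  round(48/17 * 32) = 90 = 0x5A, round(32/17 * 32) = 60 = 0x3C.
    // Q2.13: round(48/17 * 8192) = 23130 = 0x5A5A,
    //        round(32/17 * 8192) = 15420 = 0x3C3C.
    int main()
    {
        std::printf("0x%X 0x%X\n", int(48.0 / 17.0 * 32 + 0.5),
                    int(32.0 / 17.0 * 32 + 0.5)); // prints 0x5A 0x3C
    }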
-inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position) -{ - // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 - const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823 - const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 - const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); - const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position); - - // Find shift value - const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); - const qint8x8_t temp = vshl_s8(a, shift_value); - - // Newton-Raphson division initial estimate X0 calculation - qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position)); - - uint8x8_t set_one = vcgt_s8(x, const_one); - x = vbsl_s8(set_one, const_one, x); - - // Use three iterations of Newton-Raphson method to get the result - x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position); - x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position); - x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position); - - return vshl_s8(x, shift_value); -} - -inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position) -{ - // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0 - const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823 - const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823 - const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position); - const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position); - - // Find shift value - const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)))); - const qint16x4_t temp = vshl_s16(a, shift_value); - - // Newton-Raphson division initial estimate X0 calculation - qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position)); - - uint16x4_t set_one = vcgt_s16(x, const_one); - x = vbsl_s16(set_one, const_one, x); - - // Use four iterations of Newton-Raphson method to get the result - x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position); - - return vshl_s16(x, shift_value); -} - -inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position) -{ - // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 - const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823 - const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 - const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); - const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position); - - // Find shift value - const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); - const qint8x8_t temp = vqshl_s8(a, shift_value); - - // Newton-Raphson division initial estimate X0 calculation - qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position)); - - uint8x8_t set_one = vcgt_s8(x, const_one); - x = vbsl_s8(set_one, const_one, x); - - // Use three iterations of Newton-Raphson method to get the result - x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x,
fixed_point_position)), fixed_point_position); - x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position); - x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position); - - return vqshl_s8(x, shift_value); -} - -inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position) -{ - // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0 - const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823 - const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823 - const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position); - const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position); - - // Find shift value - const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)))); - const qint16x4_t temp = vqshl_s16(a, shift_value); - - // Newton-Raphson division initial estimate X0 calculation - qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position)); - - uint16x4_t set_one = vcgt_s16(x, const_one); - x = vbsl_s16(set_one, const_one, x); - - // Use four iterations of Newton-Raphson method to get the result - x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position); - - return vqshl_s16(x, shift_value); -} - -inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position) -{ - // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 - const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823 - const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 - const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); - const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position); - - // Find shift value - const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); - const qint8x16_t temp = vshlq_s8(a, shift_value); - - // Newton-Raphson division initial estimate X0 calculation - qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position)); - - // Set initial guess to one if x > 1 - uint8x16_t set_one = vcgtq_s8(x, const_one); - x = vbslq_s8(set_one, const_one, x); - - // Use three iterations of Newton-Raphson method to get the result - x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position); - x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position); - x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position); - - return vshlq_s8(x, shift_value); -} -
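Before iterating, each reciprocal routine normalises its argument into [0.5, 1): the vclz-based shift places the leading set bit just below the binary point, and the same shift is applied again to the result, since 1/(a * 2^s) = (1/a) * 2^-s. Scalar model of that step for the 16 bit case (hypothetical helper; a > 0 assumed):

    #include <cstdint>

    // Returns a shifted into [0.5, 1) in Qx.fpp and reports the shift;
    // e.g. a = 2.0 in Q2.13 (16384) gives shift = -2 and returns 4096 (0.5).
    int16_t normalize_q16(int16_t a, int fpp, int &shift)
    {
        int clz = 0;
        for (int bit = 15; bit >= 0 && !((a >> bit) & 1); --bit)
            ++clz;                  // leading zeros, as vclz_s16 computes
        shift = clz + fpp - 16;     // usually negative, i.e. a right shift
        return shift >= 0 ? int16_t(a << shift) : int16_t(a >> -shift);
    }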
-inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position) -{ - // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0 - const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823 - const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823 - const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position); - const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position); - - // Find shift value - const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)))); - const qint16x8_t temp = vshlq_s16(a, shift_value); - - // Newton-Raphson division initial estimate X0 calculation - qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position)); - - // Set initial guess to one if x > 1 - uint16x8_t set_one = vcgtq_s16(x, const_one); - x = vbslq_s16(set_one, const_one, x); - - // Use four iterations of Newton-Raphson method to get the result - x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position); - - return vshlq_s16(x, shift_value); -} - -inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position) -{ - // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 - const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823 - const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 - const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); - const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position); - - // Find shift value - const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); - const qint8x16_t temp = vqshlq_s8(a, shift_value); - - // Newton-Raphson division initial estimate X0 calculation - qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position)); - - // Set initial guess to one if x > 1 - uint8x16_t set_one = vcgtq_s8(x, const_one); - x = vbslq_s8(set_one, const_one, x); - - // Use three iterations of Newton-Raphson method to get the result - x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position); - x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position); - x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position); - - return vqshlq_s8(x, shift_value); -} - -inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position) -{ - // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0 - const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823 - const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823 - const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position); - const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position); - - // Find shift value - const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)))); - const qint16x8_t temp = vqshlq_s16(a, shift_value); - - // Newton-Raphson division initial estimate X0 calculation - qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position)); - - //
Set initial guess to one if x > 1 - uint16x8_t set_one = vcgtq_s16(x, const_one); - x = vbslq_s16(set_one, const_one, x); - - // Use four iterations of Newton-Raphson method to get the result - x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position); - x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position); - - // Saturate result in case of overflow - return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value)); -}
- -inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) -{ - return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position); -}
- -inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position) -{ - return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position); -}
- -inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) -{ - return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position); -}
- -inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position) -{ - return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position); -}
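For reference, all of the reciprocal variants deleted above, and the vdiv*/vdivq* helpers that simply multiply by them, implement one scheme: normalise the input into a unit range with a CLZ-derived shift, seed Newton-Raphson division with x0 = 48/17 - 32/17 * a, refine with x <- x * (2 - a * x), and shift back. A minimal scalar sketch of that scheme, illustrative only and not library code (fp_mul and fp_recip are hypothetical helpers; the normalisation and the clamp of the initial guess to 1.0 are simplified away):

#include <cstdint>
#include <cstdio>

// Qp fixed point: p fractional bits, real value = v / 2^p.
int32_t fp_mul(int32_t a, int32_t b, int p) { return int32_t((int64_t(a) * b) >> p); }

int32_t fp_recip(int32_t a, int p) // assumes a is already scaled into [0.5, 1)
{
    const int32_t c48_17 = int32_t(2.8235 * (1 << p)); // 48/17
    const int32_t c32_17 = int32_t(1.8823 * (1 << p)); // 32/17
    int32_t       x      = c48_17 - fp_mul(c32_17, a, p); // initial estimate
    const int32_t two    = 2 << p;
    for (int i = 0; i < 3; ++i) // 3 iterations suffice for 8-bit formats
    {
        x = fp_mul(x, two - fp_mul(a, x, p), p); // x <- x * (2 - a*x)
    }
    return x;
}

int main()
{
    const int     p = 14; // Q.14
    const int32_t a = int32_t(0.75 * (1 << p));
    std::printf("1/0.75 ~ %f\n", fp_recip(a, p) / double(1 << p)); // ~1.3333
}

Division is then just a multiply, a / b == fp_mul(a, fp_recip(b, p), p), which is exactly what the vdiv* wrappers above do.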
- -template <bool islog> -inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position) -{ - const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position)); - const qint8x8_t const_one = vdup_n_s8(1); - const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value); - const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value); - const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value); - const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value); - const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C); - const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B); - const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A); - const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position); - return res; -}
- -template <bool islog> -inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position) -{ - const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position)); - const qint16x4_t const_one = vdup_n_s16(1); - const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value); - const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value); - const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value); - const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value); - const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C); - const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B); - const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A); - const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position); - return res; -}
- -template <bool islog> -inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position) -{ - const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position)); - const qint8x8_t const_one = vdup_n_s8(1); - const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value); - const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value); - const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value); - const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value); - const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C); - const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B); - const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A); - const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position); - return res; -}
- -template <bool islog> -inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position) -{ - const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position)); - const qint16x4_t const_one = vdup_n_s16(1); - const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value); - const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value); - const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value); - const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value); - const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C); - const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B); - const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A); - const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position); - return res; -}
- -template <bool islog> -inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position) -{ - const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position)); - const qint8x16_t const_one = vdupq_n_s8(1); - const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value); - const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value); - const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value); - const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value); - const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C); - const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B); - const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A); - const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position); - return res; -}
- -template <bool islog> -inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position) -{ - const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position)); - const qint16x8_t const_one = vdupq_n_s16(1); - const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value); - const qint16x8_t B = vrshlq_s16(islog ?
log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value); - const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value); - const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value); - const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C); - const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B); - const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A); - const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position); - return res; -}
- -template <bool islog> -inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position) -{ - const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position)); - const qint8x16_t const_one = vdupq_n_s8(1); - const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value); - const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value); - const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value); - const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value); - const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C); - const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B); - const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A); - const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position); - return res; -}
- -template <bool islog> -inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position) -{ - const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position)); - const qint16x8_t const_one = vdupq_n_s16(1); - const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value); - const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value); - const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value); - const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value); - const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C); - const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B); - const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A); - const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position); - return res; -}
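Every v(q)taylor_poly* template deleted above performs the same four-coefficient Horner evaluation; the islog flag only selects log_tab over exp_tab and rescales the leading coefficient. A scalar sketch of the evaluation order, illustrative only (fp_mul and taylor_poly are hypothetical helper names):

#include <cstdint>

int32_t fp_mul(int32_t a, int32_t b, int p) { return int32_t((int64_t(a) * b) >> p); }

// Evaluates a*(A + a*(B + a*(C + a*D))) in Qp, mirroring the x1/x2/x3/res steps above.
int32_t taylor_poly(int32_t a, int p, const int32_t c[4]) // c = {A, B, C, D}
{
    int32_t x = fp_mul(a, c[3], p) + c[2]; // a*D + C
    x         = fp_mul(a, x, p) + c[1];    // a*(a*D + C) + B
    x         = fp_mul(a, x, p) + c[0];    // a*(...) + A
    return fp_mul(a, x, p);                // one final multiply by a
}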
- -inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position) -{ - const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7); - const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); - const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2) - const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2) - - // Perform range reduction [-log(2),log(2)] - const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2) - - // get decimal part from m - const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position)); - - qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position); - alpha = vqabs_qs8(vqsub_s8(a, alpha)); - - // Polynomial Approximation - qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position); - poly = vqadd_s8(poly, const_one); - - // Reconstruct - poly = vqshl_s8(poly, dec_m); - - return poly; -}
- -inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position) -{ - const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15); - const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position); - const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2) - const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2) - - // Perform range reduction [-log(2),log(2)] - const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2) - - // get decimal part from m - const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position)); - - qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position); - alpha = vqabs_qs16(vqsub_s16(a, alpha)); - - // Polynomial Approximation - qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position); - poly = vqadd_s16(poly, const_one); - - // Reconstruct - poly = vqshl_s16(poly, dec_m); - - return poly; -}
- -inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position) -{ - const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7); - const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); - const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2) - const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2) - - // Perform range reduction [-log(2),log(2)] - const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2) - - // get decimal part from m - const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position)); - - qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position); - alpha = vqabsq_qs8(vqsubq_qs8(a, alpha)); - - // Polynomial Approximation - qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position); - poly = vqaddq_s8(poly, const_one); - - // Reconstruct - poly = vqshlq_s8(poly, dec_m); - - return poly; -}
- -inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position) -{ - const qint16x8_t shift_value =
vdupq_n_s16(fixed_point_position - 15); - const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position); - const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2) - const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2) - - // Perform range reduction [-log(2),log(2)] - const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2) - - // get decimal part from m - const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position)); - - qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position); - alpha = vqabsq_qs16(vqsubq_qs16(a, alpha)); - - // Polynomial Approximation - qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position); - poly = vqaddq_s16(poly, const_one); - - // Reconstruct - poly = vqshlq_s16(poly, dec_m); - - return poly; -}
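The four vqexp* routines above share one range-reduction idea: write a = n*ln(2) + alpha with integer n (dec_m in the code), evaluate the polynomial on the small remainder alpha, then reconstruct with a shift, since exp(n*ln2 + alpha) = 2^n * exp(alpha). A floating-point sketch of the same flow, illustrative only and not the fixed-point implementation:

#include <cmath>

double exp_by_range_reduction(double a)
{
    const double ln2   = 0.6931471805599453;
    const int    n     = int(std::floor(a / ln2)); // the kernels derive this with shifts
    const double alpha = a - n * ln2;              // remainder in [0, ln2)
    // 4-term Taylor series around 0, standing in for exp_tab:
    const double poly = 1 + alpha * (1 + alpha * (0.5 + alpha * (1.0 / 6 + alpha / 24)));
    return std::ldexp(poly, n);                    // poly * 2^n, the final vqshl above
}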
- -inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position) -{ - const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); - const qint8x8_t const_seven_dec = vdup_n_s8(7); - const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2) - - // If 0 < a < 1, calculate log(1/x) - uint8x8_t calc_reciprocal = vclt_s8(a, const_one); - qint8x8_t recip = vdup_n_s8(0); - recip = vbsl_s8(calc_reciprocal, recip, a); - - // Calculate reciprocal - recip = vrecip_qs8(recip, fixed_point_position); - a = vbsl_s8(calc_reciprocal, recip, a); - - // Get decimal part of a - qint8x8_t shift_value = vdup_n_s8(-fixed_point_position); - qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position - - // Get exponent of 2^n which is equal or less than dec_a - shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a)); - - // Get x to range (1, 2] - const qint8x8_t shift_value_neg = vneg_s8(shift_value); - const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one); - const qint8x8_t sum = vmul_s8(shift_value, const_one); - - // Polynomial Approximation - qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position); - - // Reconstruct - poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position); - - // Set negative value for 0 < a < 1 - poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly); - - return poly; -}
- -inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position) -{ - const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position); - const qint16x4_t const_fifteen_dec = vdup_n_s16(15); - const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2) - - // If 0 < a < 1, calculate log(1/x) - uint16x4_t calc_reciprocal = vclt_s16(a, const_one); - qint16x4_t recip = vdup_n_s16(0); - recip = vbsl_s16(calc_reciprocal, recip, a); - - // Calculate reciprocal - recip = vrecip_qs16(recip, fixed_point_position); - a = vbsl_s16(calc_reciprocal, recip, a); - - // Get decimal part of a - qint16x4_t shift_value = vdup_n_s16(-fixed_point_position); - qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position - - // Get exponent of 2^n which is equal or less than dec_a - shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a)); - - // Get x to range (1, 2] - const qint16x4_t shift_value_neg = vneg_s16(shift_value); - const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one); - const qint16x4_t sum = vmul_s16(shift_value, const_one); - - // Polynomial Approximation - qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position); - - // Reconstruct - poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position); - - // Set negative value for 0 < a < 1 - poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly); - - return poly; -}
- -inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position) -{ - const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); - const qint8x16_t const_seven_dec = vdupq_n_s8(7); - const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2) - - // If 0 < a < 1, calculate log(1/x) - uint8x16_t calc_reciprocal = vcltq_s8(a, const_one); - qint8x16_t recip = vdupq_n_s8(0); - recip = vbslq_s8(calc_reciprocal, a, recip); - - // Calculate reciprocal - recip = vrecipq_qs8(recip, fixed_point_position); - a = vbslq_s8(calc_reciprocal, recip, a); - - // Get decimal part of a - qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position); - qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position - - // Get exponent of 2^n which is equal or less than dec_a - shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a)); - - // Get x to range (1, 2] - const qint8x16_t shift_value_neg = vnegq_s8(shift_value); - const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one); - const qint8x16_t sum = vmulq_s8(shift_value, const_one); - - // Polynomial Approximation - qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position); - - // Reconstruct - poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position); - - // Set negative value for 0 < a < 1 - poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly); - - return poly; -}
- -inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position) -{ - const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position); - const qint16x8_t const_fifteen_dec = vdupq_n_s16(15); - const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2) - - // If 0 < a < 1, calculate log(1/x) - uint16x8_t calc_reciprocal = vcltq_s16(a, const_one); - qint16x8_t recip = vdupq_n_s16(0); - recip = vbslq_s16(calc_reciprocal, a, recip); - - // Calculate reciprocal - recip = vqrecipq_qs16(recip, fixed_point_position); - a = vbslq_s16(calc_reciprocal, recip, a); - - // Get decimal part of a - qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position); - qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position - - // Get exponent of 2^n which is equal or less than dec_a - shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a)); - - // Get x to range (1, 2] - const qint16x8_t shift_value_neg = vnegq_s16(shift_value); - const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one); - const qint16x8_t sum = vmulq_s16(shift_value, const_one); - - // Polynomial Approximation - qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position); - - // Reconstruct - poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position); - - // Set negative value for 0 < a < 1 - poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly); - - return poly; -}
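The vlog* family mirrors that reduction: factor a = 2^n * m with m in (1, 2], run the polynomial on t = m - 1, and reconstruct via ln(a) = (log2(m) + n) * ln(2); inputs in (0, 1) are handled as log(1/a) followed by a negation, as the calc_reciprocal path shows. A scalar sketch, illustrative only (the polynomial below is a generic Taylor fit to log2(1 + t), not the library's log_tab values):

#include <cmath>

double log_by_normalisation(double a) // a > 0
{
    const double ln2   = 0.6931471805599453;
    const bool   recip = a < 1.0; // the kernels compute log(1/a) and negate
    if (recip)
        a = 1.0 / a;
    const int    n    = std::ilogb(a);           // a / 2^n lands in [1, 2)
    const double t    = std::ldexp(a, -n) - 1.0; // t in [0, 1)
    const double poly = t * (1.4427 + t * (-0.7213 + t * (0.4809 - t * 0.3606)));
    const double res  = (poly + n) * ln2;
    return recip ? -res : res;
}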
- -inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position) -{ - const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position); - - // Find shift value. Number must be in (0.5, 2) range. - qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); - - // Add one when the shift value is negative in order to get the correct result when we shift right with 1 - qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))); - uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0)); - temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp); - qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1)); - - temp = vshl_s8(a, shift_value); - - // Initial guess - qint8x8_t x = temp; - - // Calculate (x / 2) * (3 - a * x^2) - // After three iterations we have the result for 8 bit - x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - - return vshl_s8(x, shift_value2); -}
- -inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position) -{ - const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); - - // Find shift value. Number must be in (0.5, 2) range. - qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)))); - - // Add one when the shift value is negative in order to get the correct result when we shift right with 1 - qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))); - uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0)); - temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp); - qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1)); - - temp = vshl_s16(a, shift_value); - - // Initial guess - qint16x4_t x = temp; - - // Calculate (x / 2) * (3 - a * x^2) - // After five iterations we have the result for 16 bit - x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - - return vshl_s16(x, shift_value2); -}
- -inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position) -{ - const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position); - - // Find shift value. Number must be in (0.5, 2) range.
- qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); - - // Add one when the shift value is negative in order to get the correct result when we shift right with 1 - qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))); - uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0)); - temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp); - qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1)); - - temp = vqshl_s8(a, shift_value); - - // Initial guess - qint8x8_t x = temp; - - // Calculate (x / 2) * (3 - a * x^2) - // After three iterations we have the result for 8 bit - x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - - return vqshl_s8(x, shift_value2); -} - -inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position) -{ - const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); - - // Find shift value. Number must be in (0.5, 2) range. - qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)))); - - // Add one when the shift value is negative in order to get the correct result when we shift right with 1 - qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))); - uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0)); - temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp); - qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1)); - - temp = vqshl_s16(a, shift_value); - - // Initial guess - qint16x4_t x = temp; - - // Calculate (x / 2) * (3 - a * x^2) - // After five iterations we have the result for 16 bit - x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - - return vqshl_s16(x, shift_value2); -} - -inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position) -{ - const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); - - // Find shift value. Number must be in (0.5, 2) range. 
- qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); - - // Add one when the shift value is negative in order to get the correct result when we shift right with 1 - qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))); - uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0)); - temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp); - qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1)); - - temp = vshlq_s8(a, shift_value); - - // Initial guess - qint8x16_t x = temp; - - // Calculate (x / 2) * (3 - a * x^2) - // After three iterations we have the result for 8 bit - x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - - return vshlq_s8(x, shift_value2); -} - -inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position) -{ - const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); - - // Find shift value. Number must be in (0.5, 2) range. - qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)))); - - // Add one when the shift value is negative in order to get the correct result when we shift right with 1 - qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))); - uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0)); - temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp); - qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1)); - - temp = vshlq_s16(a, shift_value); - - // Initial guess - qint16x8_t x = temp; - - // Calculate (x / 2) * (3 - a * x^2) - // After five iterations we have the result for 16 bit - x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - - return vshlq_s16(x, shift_value2); -} - -inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position) -{ - const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); - - // Find shift value. Number must be in (0.5, 2) range. 
- qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); - - // Add one when the shift value is negative in order to get the correct result when we shift right with 1 - qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))); - uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0)); - temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp); - qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1)); - - temp = vqshlq_s8(a, shift_value); - - // Initial guess - qint8x16_t x = temp; - - // Calculate (x / 2) * (3 - a * x^2) - // After three iterations we have the result for 8 bit - x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - - return vqshlq_s8(x, shift_value2); -}
- -inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position) -{ - const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); - - // Find shift value. Number must be in (0.5, 2) range. - qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)))); - - // Add one when the shift value is negative in order to get the correct result when we shift right with 1 - qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))); - uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0)); - temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp); - qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1)); - - temp = vqshlq_s16(a, shift_value); - - // Initial guess - qint16x8_t x = temp; - - // Calculate (x / 2) * (3 - a * x^2) - // After five iterations we have the result for 16 bit - x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); - - return vqshlq_s16(x, shift_value2); -}
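All eight v(q)invsqrt* variants above rely on the same Newton-Raphson step for 1/sqrt(a): scale a by an even power of two into (0.5, 2), iterate x <- x * (3 - a * x^2) / 2 with the scaled input itself as the initial guess, then undo half of the scaling shift (shift_value2). A scalar sketch, illustrative only (fp_mul and fp_invsqrt are hypothetical helpers; the CLZ-based scaling is omitted):

#include <cstdint>

int32_t fp_mul(int32_t a, int32_t b, int p) { return int32_t((int64_t(a) * b) >> p); }

int32_t fp_invsqrt(int32_t a, int p, int iterations) // assumes a is already in (0.5, 2)
{
    const int32_t three = 3 << p;
    int32_t       x     = a; // initial guess, as in the kernels
    for (int i = 0; i < iterations; ++i) // 3 iterations for 8 bit, 5 for 16 bit
    {
        x = fp_mul(x, three - fp_mul(a, fp_mul(x, x, p), p), p) >> 1; // x * (3 - a*x^2) / 2
    }
    return x;
}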
- -inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position) -{ - const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); - const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position); - - const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position); - const qint8x8_t num = vqsub_qs8(exp2x, const_one); - const qint8x8_t den = vqadd_qs8(exp2x, const_one); - const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position); - - return tanh; -}
- -inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position) -{ - const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position); - const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position); - - const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position); - const qint16x4_t num = vqsub_qs16(exp2x, const_one); - const qint16x4_t den = vqadd_qs16(exp2x, const_one); - const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position); - - return tanh; -}
- -inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position) -{ - const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); - const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position); - - const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position); - const qint8x16_t num = vqsubq_qs8(exp2x, const_one); - const qint8x16_t den = vqaddq_qs8(exp2x, const_one); - const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position); - - return tanh; -}
- -inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position) -{ - const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position); - const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position); - - const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position); - const qint16x8_t num = vqsubq_qs16(exp2x, const_one); - const qint16x8_t den = vqaddq_qs16(exp2x, const_one); - const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position); - - return tanh; -}
- -inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) -{ - return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position); -}
- -inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position) -{ - return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position); -}
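The tanh and pow wrappers deleted above reduce to two identities built from the exp, log and reciprocal kernels: tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), which costs one exp plus one reciprocal, and a^b = e^(b * ln(a)). Shown here in scalar form, illustrative only:

#include <cmath>

double tanh_via_exp(double x)
{
    const double e2x = std::exp(2 * x); // one exp evaluation
    return (e2x - 1) / (e2x + 1);       // num / den via one reciprocal
}

double pow_via_exp_log(double a, double b) // a > 0
{
    return std::exp(b * std::log(a));
}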
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b) { diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h index 06a0a01782..0290e32085 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -24,7 +24,6 @@ #ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ #define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ -#include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/QAsymm8.h" @@ -59,7 +58,7 @@ public: * @note If the output tensor is a nullptr, the activation function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * of the activation function. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] activation_info Activation layer information. */ @@ -67,7 +66,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * of the activation function. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor info. Data type supported: same as @p input * @param[in] act_info Activation layer information. * @@ -99,24 +98,12 @@ private: template <ActivationLayerInfo::ActivationFunction F, typename T> typename std::enable_if<std::is_same<T, float16_t>::value, void>::type activation(const Window &window); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template <ActivationLayerInfo::ActivationFunction F, typename T> - typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window); /** Function to apply an activation function on a tensor. * * @param[in] window Region on which to execute the kernel */ template <ActivationLayerInfo::ActivationFunction F, typename T> typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template <ActivationLayerInfo::ActivationFunction F, typename T> - typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type activation(const Window &window); private: ITensor *_input; diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h index 155e792f5d..8cf21eae9d 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -57,26 +57,24 @@ public: * Valid configurations (Input1,Input2) -> Output : * * - (U8,U8) -> U8 - * - (QS8,QS8) -> QS8 * - (U8,U8) -> S16 * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 - * - (QS16,QS16) -> QS16 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] output The output tensor. Data types supported: U8/S16/F16/F32. + * @param[in] policy Overflow policy. * * @return a status @@ -90,9 +88,9 @@ public: private: /** Common signature for all the specialised add functions * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor.
Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] window Region on which to execute the kernel. */ using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h index 73ecfcfeb5..3e93922b65 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -57,26 +57,24 @@ public: * Valid configurations (Input1,Input2) -> Output : * * - (U8,U8) -> U8 - * - (QS8,QS8) -> QS8 * - (U8,U8) -> S16 * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 - * - (QS16,QS16) -> QS16 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel * - * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32 + * @param[in] output Output tensor. Data types supported: U8/S16/F16/F32 * @param[in] policy Policy to use to handle overflow. * * @return a status @@ -89,9 +87,9 @@ public: private: /** Common signature for all the specialised sub functions * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] window Region on which to execute the kernel. */ using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
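With the fixed-point types gone, the addition and subtraction kernels share the reduced table of valid (input1, input2) -> output pairings listed above. A compact restatement as code, illustrative only (DT and valid_add_sub_config are hypothetical, not part of the library):

#include <cstdio>

enum class DT { U8, S16, F16, F32 };

bool valid_add_sub_config(DT in1, DT in2, DT out)
{
    if (in1 == DT::U8 && in2 == DT::U8)
        return out == DT::U8 || out == DT::S16; // (U8,U8) -> U8 or S16
    if ((in1 == DT::S16 && in2 == DT::U8) || (in1 == DT::U8 && in2 == DT::S16) || (in1 == DT::S16 && in2 == DT::S16))
        return out == DT::S16;                  // mixed/integer cases widen to S16
    return in1 == in2 && in1 == out && (in1 == DT::F16 || in1 == DT::F32); // float passthrough
}

int main()
{
    std::printf("%d %d\n", valid_add_sub_config(DT::U8, DT::U8, DT::S16),    // 1
                valid_add_sub_config(DT::F16, DT::F32, DT::F32));            // 0
}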
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 2d33f87dfa..2a540c151b 100644 --- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -57,7 +57,7 @@ public: * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input @@ -72,7 +72,7 @@ public: * * @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input @@ -96,22 +96,7 @@ private: void configure_non_fused(); /** Configure execution function in case of fused activation **/ void configure_fused(); - /** Template function to run batch normalization on 8-bit fixed point - * - * @tparam fused_activation Boolean that flags if its a fused activation or not - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template <bool fused_activation> - void batch_normalization_qs8(const Window &window); - /** Template function to run batch normalization on 16-bit fixed point - * - * @tparam fused_activation Boolean that flags if its a fused activation or not - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template <bool fused_activation> - void batch_normalization_qs16(const Window &window); + /** Template function to run batch normalization on fp16 * * @tparam fused_activation Boolean that flags if its a fused activation or not diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h index 9fb493cc4f..f02858e7d9 100644 --- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h +++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h @@ -72,7 +72,7 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. */ void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims); /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert.
Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h index 65ce764246..d5c9e3bbe9 100644 --- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h +++ b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h @@ -59,7 +59,7 @@ public: ~NEConvertFullyConnectedWeightsKernel() = default; /** Set the input and output tensor. * - * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32. + * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32. * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format. * @param[in] data_layout The data layout the weights have been trained in. @@ -67,7 +67,7 @@ public: void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeightsKernel * - * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32. + * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32. * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format. * @param[in] data_layout The data layout the weights have been trained in. diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h index 67ef5293b7..12a5051ef8 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h @@ -55,7 +55,7 @@ public: ~NEDepthConcatenateLayerKernel() = default; /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. + * @param[in] input Input tensor. Data types supported: F16/F32. * @param[in] depth_offset The offset on the Z axis. * @param[in,out] output Output tensor. Data types supported: Same as @p input. 
* diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h index 50536f2b47..77bb0413ca 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h @@ -55,19 +55,12 @@ public: * * Valid conversions Input -> Output : * - * - QS8 -> QS8, F32 * - U8 -> U16, S16, S32 * - U16 -> U8, U32 * - S16 -> U8, S32 - * - QS16 -> QS16, F32 - * - F32 -> QS8 * - * @warning In case of in-place fixed point position conversion make sure that configure has been called - * before the updated tensor is used in other functions, as the TensorInfo of the tensor will be - * altered. In-place is only supported for QS8 -> QS8, QS16 -> QS16. - * - * @param[in, out] input The input tensor to convert (Written in case of in-place computation). Data types supported: U8/QS8/U16/S16/F32. - * @param[out] output The output tensor. Can be null in case of in-place computation. Data types supported: U8/QS8/U16/S16/U32/S32/F32. + * @param[in, out] input The input tensor to convert (Written in case of in-place computation). Data types supported: U8/U16/S16. + * @param[out] output The output tensor. Can be null in case of in-place computation. Data types supported: U8/U16/S16/U32/S32/F32. * @param[in] policy Conversion policy. * @param[in] shift (Optional) Value for down/up conversions. Must be 0 <= shift < 8. * In case of fixed point position conversion, it specifies the new fixed point position, if operation is in-place. @@ -82,8 +75,6 @@ private: ITensor *_output; ConvertPolicy _policy; uint32_t _shift; - int _fixed_point_position_input; - int _fixed_point_position_output; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h index f859f97dae..589725ab01 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h @@ -57,24 +57,24 @@ public: * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. * @param[out] output Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32 + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. */ void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info); /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel * * @param[in] input The input tensor to convolve. 
3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. * @param[in] output Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32 + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS32/F16/F32 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h index 77711d7ecd..7fd1d70374 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h @@ -55,10 +55,10 @@ public: /** Set the accumulate buffer and the biases of the kernel. * * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS16/QS32/F16/F32 + * Data type supported: QS32/F16/F32 * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Data type supported: QS8/QS16/F16/F32 + * Data type supported: F16/F32 * @param[in] result_fixedpoint_multiplier (Optional)Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add * @param[in] result_shift (Optional)Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication * @param[in] result_offset_after_shift (Optional)Offset to be applied to result before converting it back to QASYMM8 @@ -68,10 +68,10 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel * * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS16/QS32/F16/F32 + * Data type supported: QS32/F16/F32 * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Data type supported: QS8/QS16/F16/F32 + * Data type supported: F16/F32 * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr); diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h index dd19b8f35a..cff6b4ea2d 100644 --- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h +++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h @@ -57,7 +57,7 @@ public: * * @note This kernel fills the borders within the XY-planes. * - * @param[in,out] tensor Tensor to process. 
Data types supported: U8/S8/QS8/QASYMM8/QS16/S16/S32/F32. + * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QASYMM8/S16/S32/F32. * @param[in] border_size Size of the border to fill in elements. * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h index 545a265dc2..2b6c7af72a 100644 --- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h +++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h @@ -57,7 +57,7 @@ public: * * @note This kernel fills the borders within the XY-planes. * - * @param[in,out] input Tensor to process. Data types supported: U8/QS8/S16/S32/F32. + * @param[in,out] input Tensor to process. Data types supported: U8/S16/S32/F32. * @param[in] border_size Size of the border to fill in elements. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h index 79504fd4da..5c0104d138 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -60,13 +60,13 @@ public: NEGEMMInterleave4x4Kernel(); /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. * * @return a status @@ -79,7 +79,7 @@ public: private: /** Common signature for all the transpose functions * - * @param[in] input An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input An input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h index e48a9a77e4..419a9f9150 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h @@ -51,13 +51,13 @@ public: ~NEGEMMMatrixAccumulateBiasesKernel() = default; /** Set the accumulate buffer and the biases of the kernel. * - * @param[in, out] accum The accumulate tensor to convert. Data type supported: QS8/QS16/F32 + * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32 * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. 
Data type supported: Same as @p accum */ void configure(ITensor *accum, const ITensor *biases); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel * - * @param[in] accum The accumulate tensor to convert. Data type supported: QS8/QS16/F32 + * @param[in] accum The accumulate tensor to convert. Data type supported: F32 * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p accum * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h index 5e4f8b72ff..1a235933dc 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h @@ -59,7 +59,7 @@ public: * * @note The input and output tensors must have the same dimensions * - * @param[in] input Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32 + * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32 * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input. * @param[in] beta Weight of matrix C */ @@ -71,7 +71,7 @@ private: /** Common signature for all the matrix addition functions * - * @param[in] input An input tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] input An input tensor. Data types supported: F16/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. * @param[in] beta Weight of matrix C diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h index d54522c678..6ee958205e 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h @@ -58,7 +58,7 @@ public: * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel * These two kernels change the layout of the original matrices to be more cache-friendly. * - * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32 + * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector. * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0 * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. @@ -69,7 +69,7 @@ public: void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel * - * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A.
Data types supported: F16/F32 * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector. * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0 * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h index fcdd8dd93c..b7fbfcfcd2 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h @@ -74,13 +74,13 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor info. Data type supported: same as @p input. * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h index 5aa803f4fd..d455fd98b3 100644 --- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h @@ -77,7 +77,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/F16/F32 * Note: QASYMM8 works only for has_bias = false * @param[out] output The output tensor. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -92,7 +92,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/F16/F32 * Note: QASYMM8 works only for has_bias = false * @param[in] output The output tensor. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h index 6ae7b73423..92086437a6 100644 --- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h @@ -54,7 +54,7 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor.
3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. + * and an optional 4th dimension for a batch of inputs. Data types supported: F16/F32. * @param[in] input_squared Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM], * Data type supported: same as @p input * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input @@ -64,7 +64,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. + * and an optional 4th dimension for a batch of inputs. Data types supported: F16/F32. * @param[in] input_squared Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM], * Data type supported: same as @p input * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input @@ -92,18 +92,6 @@ private: template void normalize_float(const Window &window); - /** Function to perform normalization for fixed-point values depending on - * the given template dimension. The second template parameter specifies - * whether the normalization has to be 1D or 2D. - * - * @note Only supported normalizations are: - * - 1D over X or Z - * - 2D over X and Y - * - * @param[in] window Region on which to execute the kernel. - */ - template - void normalize_fixed_point(const Window &window); /** Common signature for all the specialised normalization functions * * @param[in] window Region on which to execute the kernel. diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/arm_compute/core/NEON/kernels/NEPermuteKernel.h index 68bbdcb3cb..b56faa8514 100644 --- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h +++ b/arm_compute/core/NEON/kernels/NEPermuteKernel.h @@ -58,7 +58,7 @@ public: * * @note Supported permutation vectors : [2, 0, 1], [1, 2, 0] * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data types supported: Same as @p input * @param[in] perm Permutation vector */ @@ -67,7 +67,7 @@ * * @note Supported permutation vectors : [2, 0, 1], [1, 2, 0] * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output The output tensor.
Data types supported: Same as @p input * @param[in] perm Permutation vector * diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h index 8c245569a5..41ea91495f 100644 --- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h @@ -55,11 +55,10 @@ public: * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * For QS8/QS16 scale = 1 is the only supported value. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. @@ -70,11 +69,10 @@ public: * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * For QS8/QS16 scale = 1 is the only supported value. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. @@ -96,15 +94,6 @@ private: * @param[out] output_ptr Pointer to the output tensor. 
*/ using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale); - /** Common signature for all the specialised multiplication functions with fixed-point values - * - * @param[in] input1_ptr Pointer to the first input tensor. - * @param[in] input2_ptr Pointer to the second input tensor. - * @param[in] scale Scaling factor. - * @param[in] fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number. - * @param[out] output_ptr Pointer to the output tensor. - */ - using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position); /** Common signature for all the specialised multiplication functions with float scaling factor * * @param[in] input1_ptr Pointer to the first input tensor. @@ -115,7 +104,6 @@ private: MulFunctionFloat *_func_float; MulFunctionInt *_func_int; - MulFunctionQInt *_func_q_int; private: const ITensor *_input1; diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h index 4140ccf1ed..6c4c1db289 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -52,18 +52,18 @@ public: ~NEPoolingLayerKernel() = default; /** Set the input and output tensors. * - * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only + * @note F16 is supported for pool sizes 2 and 3 only * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel * - * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only + * @note F16 is supported for pool sizes 2 and 3 only * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * * @return a status @@ -90,13 +90,6 @@ private: */ template void pooling2_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform 2x2 pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling2_q8_nchw(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for 8bit asymmetric fixed point. * * @param[in] window_input Input region on which to execute the kernel. */ template void pooling2_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform 2x2 pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel.
- */ - template - void pooling2_q16_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling. * * @param[in] window_input Input region on which to execute the kernel. @@ -125,13 +111,6 @@ private: */ template void pooling3_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform 3x3 pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling3_q8_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling for 8bit quantized fixed point. * * @param[in] window_input Input region on which to execute the kernel. @@ -139,13 +118,6 @@ private: */ template void pooling3_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform 3x3 pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling3_q16_nchw(const Window &window_input, const Window &window); /** Function to perform 7x7 pooling. * * @param[in] window_input Input region on which to execute the kernel. @@ -153,13 +125,6 @@ private: */ template void pooling7_f32_nchw(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void poolingMxN_q8_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 8-bit quantized. * * @param[in] window_input Input region on which to execute the kernel. @@ -174,13 +139,6 @@ private: */ template void poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void poolingMxN_q16_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 16-bit floating point values. * * @param[in] window_input Input region on which to execute the kernel. diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h index 0a3fc44881..08b4e11189 100644 --- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h @@ -40,7 +40,7 @@ public: } /** Set the input and output of the kernel * - * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/QASYMM8/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/U16/S16/QASYMM8/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h index c30a4cd23d..25c3196e34 100644 --- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -43,13 +43,13 @@ public: NELogits1DMaxKernel(); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. 
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data types supported: same as @p input */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor. Data types supported: same as @p input * * @return a status @@ -90,7 +90,7 @@ public: ~NELogits1DSoftmaxKernel() = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] max Max values tensor. Same shape as input with dimension 0 set to 1. * Data types supported: same as @p input. * @param[out] output Destination tensor. Data types supported: same as @p input. @@ -101,7 +101,7 @@ public: void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp); /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel * - * @param[in] input Source tensor info. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. * Data types supported: same as @p input. * @param[in] output Destination tensor info. Data types supported: same as @p input. diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h index dc7ef8ff7a..76823acfa1 100644 --- a/arm_compute/core/NEON/kernels/NETransposeKernel.h +++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h @@ -57,13 +57,13 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor. Data type supported: Same as @p input * * @return a status @@ -76,7 +76,7 @@ public: private: /** Common signature for all the transpose functions * - * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input An input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. */ diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h index 1a7525bfc7..21f36f6c2b 100644 --- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h +++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h @@ -75,7 +75,7 @@ public: /** Set the input and output of the kernel. 
* * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F32 * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. @@ -85,7 +85,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32 * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h index fee206638b..fd0c0f0c34 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -45,13 +45,11 @@ inline float32x4x3_t load_matrix_row(const float *ptr) } template -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position); +float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const float32x4x3_t vtop = { { @@ -108,9 +106,9 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = 
vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); @@ -118,9 +116,9 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); return out; } diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index 908fa13876..d56fd44700 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -55,29 +55,6 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) return r; } -/** Loads a 3x3 matrix as a row (qint8_t). - * - * @param[in] ptr Pointer to a qint8 3x3 matrix. - * @param[in] weights_offset (Optional) Weights quantization offset. - * - * @return The loaded matrix. - */ -inline qint8x8x3_t load_matrix_row(const qint8_t *ptr, int weights_offset = 0) -{ - ARM_COMPUTE_UNUSED(weights_offset); - /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: - r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - const qint8x8x3_t r = - { - { - vld1_dup_qs8(ptr), - vld1_dup_qs8(1 + ptr), - vld1_dup_qs8(2 + ptr) - } - }; - return r; -} - /** Loads a 3x3 matrix as a row (uint8_t). * * @param[in] ptr Pointer to a uint8_t 3x3 matrix. @@ -104,27 +81,25 @@ inline int32x4x3_t load_matrix_row(const uint8_t *ptr, int weights_offset = 0) /** Perform a convolve3x3 on float32. * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] input_offset (Optional) Input quantization offset. 
* */ template float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset = 0); + int input_offset = 0); template <> inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); ARM_COMPUTE_UNUSED(input_offset); const float32x4x3_t vtop = @@ -185,11 +160,11 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c template <> inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); @@ -199,145 +174,35 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c template <> inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); return out; } -/** Perform a convolve3x3 on qint16. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. 
- * - */ -template -qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset = 0); - -template <> -inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(fixed_point_position); - ARM_COMPUTE_UNUSED(input_offset); - - const qint8x8x3_t vtop = - { - { - vld1_qs8(in_top), - vld1_qs8(in_top + 8), - vld1_qs8(in_top + 16) - } - }; - const qint8x8x3_t vmid = - { - { - vld1_qs8(in_mid), - vld1_qs8(in_mid + 8), - vld1_qs8(in_mid + 16) - } - }; - const qint8x8x3_t vlow = - { - { - vld1_qs8(in_low), - vld1_qs8(in_low + 8), - vld1_qs8(in_low + 16) - } - }; - qint16x8x2_t out = - { - { - vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position), - vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position) - } - }; - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position); - return out; -} - -template <> -inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(input_offset); - - qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5); - out.val[0] = 
vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7); - return out; -} - -template <> -inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(input_offset); - - qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3); - return out; -} - /** Perform a convolve3x3 on uint8_t * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] input_offset (Optional) Input quantization offset. * */ template int32x4x2_t convolve_3x3(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset); + int input_offset); template <> inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const int32x4_t v_input_offset = vdupq_n_s32(input_offset); const uint8x8x2_t vtop = @@ -427,11 +292,9 @@ inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, template <> inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - - int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 2), out.val[0], 3); @@ -441,10 +304,9 @@ inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, template <> inline int32x4x2_t convolve_3x3<3>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - int32x4x2_t out = 
convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 3), out.val[0], 1); return out; } @@ -477,34 +339,6 @@ inline void store_results<3>(float *buffer, const float32x4x2_t &values) vst1_f32(buffer, vget_low_f32(values.val[0])); } -/** Stores a qint16_t array into a memory location. - * - * @param[in] buffer Pointer to the memory location where the values will be stored. - * @param[in] values Values that will be stored. - * - */ -template -void store_results(qint16_t *buffer, const qint16x8x2_t &values); - -template <> -inline void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1q_qs16(buffer, values.val[0]); - vst1q_qs16(buffer + 8, values.val[1]); -} - -template <> -inline void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1q_qs16(buffer, values.val[0]); -} - -template <> -inline void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1_qs16(buffer, vget_low_s16(values.val[0])); -} - /** Stores a uint32_t array into a memory location. * * @param[in] buffer Pointer to the memory location where the values will be stored. @@ -557,25 +391,20 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr) /** Perform a convolve3x3 on float16. * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. 
* */ template -float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position); +float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2); template <> -inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const float16x8x3_t vtop = { { @@ -627,10 +456,9 @@ inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *i } template <> -inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3); @@ -638,10 +466,9 @@ inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *i } template <> -inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); return out; } diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h index 882e4ec1d0..681e27033e 100644 --- a/arm_compute/core/SubTensorInfo.h +++ b/arm_compute/core/SubTensorInfo.h @@ -98,12 +98,6 @@ public: _parent->set_format(format); return *this; }; - ITensorInfo &set_fixed_point_position(int fixed_point_position) override - { - ARM_COMPUTE_ERROR_ON(_parent == nullptr); - _parent->set_fixed_point_position(fixed_point_position); - return *this; - }; ITensorInfo &set_tensor_shape(const TensorShape &shape) override; ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override { @@ -143,11 +137,6 @@ public: return _parent->offset_element_in_bytes(_coords); } size_t offset_element_in_bytes(const Coordinates &pos) const override; - int fixed_point_position() const override - { - ARM_COMPUTE_ERROR_ON(_parent == nullptr); - return _parent->fixed_point_position(); - } size_t element_size() const override { 
ARM_COMPUTE_ERROR_ON(_parent == nullptr); diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h index f8cfb35357..1eaf052d8e 100644 --- a/arm_compute/core/TensorInfo.h +++ b/arm_compute/core/TensorInfo.h @@ -86,20 +86,18 @@ public: * * Can be used for automatic derivation of the shape by the function. * - * @param[in] num_channels It indicates the number of channels for each tensor element - * @param[in] data_type Data type to use for each tensor element - * @param[in] fixed_point_position (Optional) It specifies the fixed point position when the tensor data type is QS8, QS16 or QS32. + * @param[in] num_channels It indicates the number of channels for each tensor element + * @param[in] data_type Data type to use for each tensor element */ - TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position = 0); + TensorInfo(size_t num_channels, DataType data_type); /** Constructor * - * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements. - * @param[in] num_channels It indicates the number of channels for each tensor element - * @param[in] data_type Data type to use for each tensor element - * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. + * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements. + * @param[in] num_channels It indicates the number of channels for each tensor element + * @param[in] data_type Data type to use for each tensor element */ - TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0); + TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type); /** Constructor * @@ -146,20 +144,18 @@ public: * * Can be used for automatic derivation of the shape by the function. * - * @param[in] num_channels Desired number of channels for each tensor element. - * @param[in] data_type Data type to use for each tensor element. - * @param[in] fixed_point_position (Optional) Fixed point position when the tensor data type is QS8, QS16 or QS32. + * @param[in] num_channels Desired number of channels for each tensor element. + * @param[in] data_type Data type to use for each tensor element. */ - void init(size_t num_channels, DataType data_type, size_t fixed_point_position = 0); + void init(size_t num_channels, DataType data_type); /** Initialize the metadata structure with the given parameters * - * @param[in] tensor_shape Size for each dimension of the tensor in number of elements. - * @param[in] num_channels Desired number of channels for each tensor element. - * @param[in] data_type Data type to use for each tensor element. - * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. + * @param[in] tensor_shape Size for each dimension of the tensor in number of elements. + * @param[in] num_channels Desired number of channels for each tensor element. + * @param[in] data_type Data type to use for each tensor element. 
*/ - void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0); + void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type); /** Initialize the metadata structure with the given parameters * @@ -169,10 +165,9 @@ public: * @param[in] strides_in_bytes Stride in bytes for accessing each dimension of the tensor. * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element. * @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element). - * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. */ void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, - size_t total_size_in_bytes, int fixed_point_position = 0); + size_t total_size_in_bytes); /** Initialize the metadata structure for the given HOG's metadata * * @param[in] hog_info HOG's metadata used to allocate normalized HOG space @@ -190,19 +185,18 @@ public: * @return Total allocation size including padding in bytes. */ size_t init_auto_padding(const TensorShape &tensor_shape, Format format); - /** Initialize the metadata structure for the given tensor shape, number of channels, - * data type and fixed point position. (Padding is automatically calculated) + /** Initialize the metadata structure for the given tensor shape, number of channels and + * data type. (Padding is automatically calculated) * * @note The padding used by this method is really conservative so that the tensor can be used for most functions. * - * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements - * @param[in] num_channels It indicates the number of channels for each tensor element - * @param[in] data_type Data type to use for each tensor element - * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. + * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements + * @param[in] num_channels It indicates the number of channels for each tensor element + * @param[in] data_type Data type to use for each tensor element * * @return Total allocation size including padding in bytes. */ - size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0); + size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type); /** Initialize the metadata structure for the given HOG's metadata * * @note init_auto_padding will be used for the tensor initialization. 
@@ -221,7 +215,6 @@ public: ITensorInfo &set_num_channels(int num_channels) override; ITensorInfo &set_format(Format format) override; ITensorInfo &set_tensor_shape(const TensorShape &shape) override; - ITensorInfo &set_fixed_point_position(int fixed_point_position) override; ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override; ITensorInfo &set_data_layout(const DataLayout &data_layout) override; ITensorInfo &reset_padding() override; @@ -244,10 +237,6 @@ public: return _offset_first_element_in_bytes; } size_t offset_element_in_bytes(const Coordinates &pos) const override; - int fixed_point_position() const override - { - return _fixed_point_position; - } size_t element_size() const override { return data_size_from_type(_data_type) * _num_channels; @@ -318,7 +307,6 @@ private: std::tuple calculate_padding_requirements(const PaddingSize &padding); size_t _total_size; - int _fixed_point_position; size_t _offset_first_element_in_bytes; Strides _strides_in_bytes; size_t _num_channels; diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index da28e131de..89fd4b8bb4 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -74,11 +74,9 @@ enum class DataType UNKNOWN, /**< Unknown data type */ U8, /**< unsigned 8-bit number */ S8, /**< signed 8-bit number */ - QS8, /**< quantized, symmetric fixed-point 8-bit number */ QASYMM8, /**< quantized, asymmetric fixed-point 8-bit number */ U16, /**< unsigned 16-bit number */ S16, /**< signed 16-bit number */ - QS16, /**< quantized, symmetric fixed-point 16-bit number */ U32, /**< unsigned 32-bit number */ S32, /**< signed 32-bit number */ QS32, /**< quantized, symmetric fixed-point 32-bit number */ diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 060d5904d4..cfebfa1506 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -110,13 +110,11 @@ inline size_t data_size_from_type(DataType data_type) { case DataType::U8: case DataType::S8: - case DataType::QS8: case DataType::QASYMM8: return 1; case DataType::U16: case DataType::S16: case DataType::F16: - case DataType::QS16: return 2; case DataType::F32: case DataType::U32: @@ -185,12 +183,10 @@ inline size_t element_size_from_data_type(DataType dt) { case DataType::S8: case DataType::U8: - case DataType::QS8: case DataType::QASYMM8: return 1; case DataType::U16: case DataType::S16: - case DataType::QS16: case DataType::F16: return 2; case DataType::U32: @@ -522,14 +518,10 @@ inline DataType get_promoted_data_type(DataType dt) return DataType::U16; case DataType::S8: return DataType::S16; - case DataType::QS8: - return DataType::QS16; case DataType::U16: return DataType::U32; case DataType::S16: return DataType::S32; - case DataType::QS16: - return DataType::QS32; case DataType::QASYMM8: case DataType::F16: case DataType::U32: @@ -1018,29 +1010,7 @@ inline bool is_data_type_quantized(DataType dt) { switch(dt) { - case DataType::QS8: case DataType::QASYMM8: - case DataType::QS16: - case DataType::QS32: - return true; - default: - return false; - } -} - -/** Check if a given data type is of fixed point type - * - * @param[in] dt Input data type. - * - * @return True if data type is of fixed point type, else false. 
- */ -inline bool is_data_type_fixed_point(DataType dt) -{ - switch(dt) - { - case DataType::QS8: - case DataType::QS16: - case DataType::QS32: return true; default: return false; diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h index 4ef94f2c6d..1646ebe719 100644 --- a/arm_compute/core/Validate.h +++ b/arm_compute/core/Validate.h @@ -545,71 +545,6 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function, #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...) \ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)) -/** Return an error if the passed tensor infos have different fixed point data types or different fixed point positions - * - * @note: If the first tensor doesn't have fixed point data type, the function returns without throwing an error - * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor_info_1 The first tensor info to be compared. - * @param[in] tensor_info_2 The second tensor info to be compared. - * @param[in] tensor_infos (Optional) Further allowed tensor infos. - * - * @return Status - */ -template -inline arm_compute::Status error_on_mismatching_fixed_point(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) -{ - DataType &&first_data_type = tensor_info_1->data_type(); - const int first_fixed_point_position = tensor_info_1->fixed_point_position(); - - if(!is_data_type_fixed_point(first_data_type)) - { - return arm_compute::Status{}; - } - - const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward(tensor_infos)... } }; - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) - { - return tensor_info->data_type() != first_data_type; - }), - function, file, line, "Tensors have different fixed point data types"); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) - { - return tensor_info->fixed_point_position() != first_fixed_point_position; - }), - function, file, line, "Tensors have different fixed point positions"); - - return arm_compute::Status{}; -} -/** Return an error if the passed tensor have different fixed point data types or different fixed point positions - * - * @note: If the first tensor doesn't have fixed point data type, the function returns without throwing an error - * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor_1 The first tensor to be compared. - * @param[in] tensor_2 The second tensor to be compared. - * @param[in] tensors (Optional) Further allowed tensors. - * - * @return Status - */ -template -inline arm_compute::Status error_on_mismatching_fixed_point(const char *function, const char *file, const int line, - const ITensor *tensor_1, const ITensor *tensor_2, Ts... 
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point(function, file, line, tensor_1->info(), tensor_2->info(),
-                                                                                detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
-    return arm_compute::Status{};
-}
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
-#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
-
 /** Return an error if the passed tensor infos have different asymmetric quantized data types or different quantization info
  *
  * @note: If the first tensor info doesn't have asymmetric quantized data type, the function returns without throwing an error
@@ -976,96 +911,5 @@ arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
 #define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
-
-/** Return an error if the input fixed-point positions are different.
- *
- * @param[in] function      Function in which the error occurred.
- * @param[in] file          Name of the file where the error occurred.
- * @param[in] line          Line on which the error occurred.
- * @param[in] tensor_info_1 The first tensor info to be compared.
- * @param[in] tensor_info_2 The second tensor info to be compared.
- * @param[in] tensor_infos  (Optional) Further allowed tensor infos.
- *
- * @return Status
- */
-template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
-                                                                     const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
-{
-    const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_info_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_info_array.begin(), tensor_info_array.end(), [&](const ITensorInfo * tensor_info)
-    {
-        return tensor_info->fixed_point_position() != tensor_info_1->fixed_point_position();
-    }),
-    function, file, line, "Tensors have different fixed-point positions");
-    return arm_compute::Status{};
-}
-/** Return an error if the input fixed-point positions are different.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file     Name of the file where the error occurred.
- * @param[in] line     Line on which the error occurred.
- * @param[in] tensor_1 The first tensor to be compared.
- * @param[in] tensor_2 The second tensor to be compared.
- * @param[in] tensors  (Optional) Further allowed tensors.
- *
- * @return Status
- */
-template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
-                                                                     const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point_position(function, file, line, tensor_1->info(), tensor_2->info(),
-                                                                                         detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
-    return arm_compute::Status{};
-}
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__))
-#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__))
-
-/** Return an error if the fixed-point value is not representable in the specified Q format.
- *
- * @param[in] function    Function in which the error occurred.
- * @param[in] file        Name of the file where the error occurred.
- * @param[in] line        Line on which the error occurred.
- * @param[in] value       The floating point value to be checked.
- * @param[in] tensor_info Input tensor info that has information on data type and fixed-point position.
- *
- * @return Status
- */
-inline arm_compute::Status error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
-                                                                           float value, const ITensorInfo *tensor_info)
-{
-    const int          fixed_point_position = tensor_info->fixed_point_position();
-    const DataType     dt                   = tensor_info->data_type();
-    const unsigned int q_max_range          = 0xFFFFFFFFu >> (((sizeof(unsigned int) - element_size_from_data_type(dt)) * 8) + 1);
-    const float        max_range            = q_max_range / (static_cast<float>(1 << fixed_point_position));
-
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(value > max_range, function, file, line,
-                                        "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position);
-    return arm_compute::Status{};
-}
-/** Return an error if the fixed-point value is not representable in the specified Q format.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file     Name of the file where the error occurred.
- * @param[in] line     Line on which the error occurred.
- * @param[in] value    The floating point value to be checked.
- * @param[in] tensor   Input tensor that has information on data type and fixed-point position.
- *
- * @return Status
- */
-inline arm_compute::Status error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
-                                                                           float value, const ITensor *tensor)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_value_not_representable_in_fixed_point(function, file, line, value, tensor->info()));
-    return arm_compute::Status{};
-}
-#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
-#define ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
 }
 #endif /* __ARM_COMPUTE_VALIDATE_H__*/
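
Note (not part of the applied patch): after this change the only data type the core helpers report
as quantized is QASYMM8. Below is a minimal standalone C++ sketch of the surviving
is_data_type_quantized() behaviour, using a trimmed stand-in enum; the real arm_compute::DataType
carries more values than shown here (QS32, for instance, is still declared in Types.h above even
though it no longer reports as quantized).

#include <cassert>

// Trimmed stand-in for arm_compute::DataType after this patch:
// QS8 and QS16 are removed from the enum entirely.
enum class DataType
{
    UNKNOWN,
    U8,
    S8,
    QASYMM8,
    U16,
    S16,
    U32,
    S32,
    F16,
    F32
};

// Mirrors the post-patch is_data_type_quantized() in Utils.h: with the
// QS8/QS16/QS32 cases deleted, only QASYMM8 reports as quantized.
inline bool is_data_type_quantized(DataType dt)
{
    switch(dt)
    {
        case DataType::QASYMM8:
            return true;
        default:
            return false;
    }
}

int main()
{
    assert(is_data_type_quantized(DataType::QASYMM8));
    assert(!is_data_type_quantized(DataType::S8)); // plain signed 8-bit is not quantized
    return 0;
}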