path: root/arm_compute/core
author    Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>  2018-07-04 09:34:00 +0100
committer Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:54:10 +0000
commit    7485d5a62685cb745ab50e970adb722cb71557ac (patch)
tree      ba01b99ca466c93edc9a3f8c1e34394ff84be060 /arm_compute/core
parent    014333d73883c3872e458cedda5ccef586a7ccd4 (diff)
download  ComputeLibrary-7485d5a62685cb745ab50e970adb722cb71557ac.tar.gz
COMPMID-970 : Remove QS8 / QS16 support
Removed fixed point related code.

Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
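For context, the QS8/QS16 formats removed here were symmetric fixed point types, where the real value is the raw integer scaled by a power of two; the retained QASYMM8 format is an asymmetric affine quantisation with a scale and a zero-point offset. A minimal sketch of the two value mappings, using hypothetical helper names written only to illustrate the difference:

    #include <cstdint>

    // QS8 (removed): value = raw * 2^-fixed_point_position
    float dequantize_qs8(int8_t raw, int fixed_point_position)
    {
        return static_cast<float>(raw) / static_cast<float>(1 << fixed_point_position);
    }

    // QASYMM8 (kept): value = scale * (raw - offset)
    float dequantize_qasymm8(uint8_t raw, float scale, int32_t offset)
    {
        return scale * static_cast<float>(static_cast<int32_t>(raw) - offset);
    }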
Diffstat (limited to 'arm_compute/core')
-rw-r--r-- arm_compute/core/CL/kernels/CLActivationLayerKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h | 12
-rw-r--r-- arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h | 14
-rw-r--r-- arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLCol2ImKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h | 9
-rw-r--r-- arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h | 8
-rw-r--r-- arm_compute/core/CL/kernels/CLFillBorderKernel.h | 2
-rw-r--r-- arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h | 6
-rw-r--r-- arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLIm2ColKernel.h | 6
-rw-r--r-- arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLPermuteKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h | 12
-rw-r--r-- arm_compute/core/CL/kernels/CLPoolingLayerKernel.h | 5
-rw-r--r-- arm_compute/core/CL/kernels/CLReshapeLayerKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h | 16
-rw-r--r-- arm_compute/core/CL/kernels/CLTransposeKernel.h | 6
-rw-r--r-- arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h | 4
-rw-r--r-- arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h | 4
-rw-r--r-- arm_compute/core/CPP/kernels/CPPPermuteKernel.h | 4
-rw-r--r-- arm_compute/core/FixedPoint.h | 373
-rw-r--r-- arm_compute/core/FixedPoint.inl | 406
-rw-r--r-- arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h | 2
-rw-r--r-- arm_compute/core/Helpers.h | 24
-rw-r--r-- arm_compute/core/Helpers.inl | 14
-rw-r--r-- arm_compute/core/ITensorInfo.h | 15
-rw-r--r-- arm_compute/core/NEON/NEFixedPoint.h | 1184
-rw-r--r-- arm_compute/core/NEON/NEFixedPoint.inl | 1958
-rw-r--r-- arm_compute/core/NEON/kernels/NEActivationLayerKernel.h | 17
-rw-r--r-- arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h | 20
-rw-r--r-- arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h | 20
-rw-r--r-- arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h | 21
-rw-r--r-- arm_compute/core/NEON/kernels/NECol2ImKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h | 2
-rw-r--r-- arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h | 13
-rw-r--r-- arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h | 8
-rw-r--r-- arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h | 8
-rw-r--r-- arm_compute/core/NEON/kernels/NEFillBorderKernel.h | 2
-rw-r--r-- arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h | 2
-rw-r--r-- arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 6
-rw-r--r-- arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h | 16
-rw-r--r-- arm_compute/core/NEON/kernels/NEPermuteKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h | 24
-rw-r--r-- arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h | 50
-rw-r--r-- arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h | 2
-rw-r--r-- arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h | 8
-rw-r--r-- arm_compute/core/NEON/kernels/NETransposeKernel.h | 6
-rw-r--r-- arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h | 4
-rw-r--r-- arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h | 14
-rw-r--r-- arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h | 249
-rw-r--r-- arm_compute/core/SubTensorInfo.h | 11
-rw-r--r-- arm_compute/core/TensorInfo.h | 54
-rw-r--r-- arm_compute/core/Types.h | 2
-rw-r--r-- arm_compute/core/Utils.h | 30
-rw-r--r-- arm_compute/core/Validate.h | 156
69 files changed, 230 insertions, 4687 deletions
diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
index c6d8f96a87..12d00de7e8 100644
--- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
@@ -51,7 +51,7 @@ public:
* @note If the output tensor is a nullptr, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] act_info Activation layer information.
*/
@@ -59,7 +59,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
* @param[in] output Destination tensor info. Data type supported: same as @p input
* @param[in] act_info Activation layer information.
*
diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
index a33cbf321f..f4275f4153 100644
--- a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
+++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
@@ -53,17 +53,17 @@ public:
~CLArithmeticAdditionKernel() = default;
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in] input1 First tensor input. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
- * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8),QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAdditionKernel
*
- * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QASYMM8/QS16/S16/F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QASYMM8 (only if @p input1 is QASYMM8), QS16 (only if @p input1 is QS16), S16/F16/F32.
- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QASYMM8 (only if both inputs are QASYMM8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32.
* @param[in] policy Policy to use to handle overflow.
*
* @return a status
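A minimal usage sketch for the kernel above, assuming an OpenCL context has already been set up (e.g. via CLScheduler::get().default_init()); the tensor names, shapes and the saturating policy are illustrative:

    #include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    CLTensor a, b, out; // illustrative tensors
    a.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    CLArithmeticAdditionKernel add_kernel;
    add_kernel.configure(&a, &b, &out, ConvertPolicy::SATURATE); // saturate on overflow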
diff --git a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
index c5f862a61f..35b918fe4b 100644
--- a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
+++ b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,17 +55,17 @@ public:
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] input2 Second tensor input. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16/F32.
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtractionKernel
*
- * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] input1 First tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), S16/F16/F32.
* @param[in] policy Policy to use to handle overflow.
*
* @return a status
diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index 8015f08d1b..9c8d02532a 100644
--- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -54,7 +54,7 @@ public:
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * The rest are optional and used for representing batches. Data types supported: F16/F32.
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
@@ -69,7 +69,7 @@ public:
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * The rest are optional and used for representing batches. Data types supported: F16/F32.
* @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
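For reference, the kernel computes the standard batch normalisation transform per feature map; a scalar sketch of the formula implied by the mean/var parameters shown here and the beta/gamma/epsilon parameters of the full signature (not all visible in this excerpt):

    #include <cmath>

    // y = gamma * (x - mean) / sqrt(var + epsilon) + beta, applied per feature map
    float batch_norm(float x, float mean, float var, float gamma, float beta, float epsilon)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }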
diff --git a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h b/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
index 684a0e5027..f7bd205ec7 100644
--- a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -48,14 +48,14 @@ public:
~CLChannelShuffleLayerKernel() = default;
/** Configure function's inputs and outputs.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*/
void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
/** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*
diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
index 3779325efe..94f21b1ebc 100644
--- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h
+++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
@@ -66,7 +66,7 @@ public:
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to convert. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * @param[in] input The input tensor to convert. Data types supported: QASYMM8/F16/F32
* @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
* while the rest represent batch of outputs. Data types supported: Same as @p input
* @param[in] convolved_dims Output convolved dimensions.
@@ -74,7 +74,7 @@ public:
void configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
/** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel
*
- * @param[in] input The input tensor to convert. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * @param[in] input The input tensor to convert. Data types supported: QASYMM8/F16/F32
* @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
* while the rest represent batch of outputs. Data types supported: Same as @p input
* @param[in] convolved_dims Output convolved dimensions.
diff --git a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
index fe24aa9d8c..f5e2f0de89 100644
--- a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
+++ b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
@@ -55,7 +55,7 @@ public:
~CLConvertFullyConnectedWeightsKernel() = default;
/** Set the input and output tensor.
*
- * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32.
+ * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32.
* @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format.
* @param[in] data_layout The data layout the weights have been trained in.
@@ -63,7 +63,7 @@ public:
void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeightsKernel
*
- * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32.
+ * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32.
* @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format.
* @param[in] data_layout The data layout the weights have been trained in.
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
index 467bdfab3b..cbcab8f554 100644
--- a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,7 @@ public:
~CLDepthConcatenateLayerKernel() = default;
/** Initialise the kernel's inputs and output
*
- * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] input Input tensor. Data types supported: F16/F32.
* @param[in] depth_offset The offset on the Z axis.
* @param[in,out] output Output tensor. Data types supported: Same as @p input.
*
diff --git a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
index 3a6310d69e..7e795c607a 100644
--- a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,17 +43,14 @@ public:
*
* Valid conversions Input -> Output :
*
- * - QS8 -> F32
- * - QS16 -> F32
* - U8 -> U16, S16, U32, S32
* - U16 -> U8, U32, S32
* - S16 -> U8, U32, S32
* - U32 -> U8, U16, S16
* - S32 -> U8, U16, S16
- * - F32 -> QS8, QS16
*
- * @param[in] input The input tensor to convert. Data types supported: U8/QS8/U16/S16/QS16/U32/S32/F32.
- * @param[out] output The output tensor. Data types supported: U8/QS8/U16/S16/QS16/U32/S32/F32.
+ * @param[in] input The input tensor to convert. Data types supported: U8/U16/S16/U32/S32/F32.
+ * @param[out] output The output tensor. Data types supported: U8/U16/S16/U32/S32/F32.
* @param[in] policy Conversion policy
* @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
*/
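As a sketch of the conversion table above: an up-conversion from U8 to S16 with a left shift of 4 (shift must satisfy 0 <= shift < 8). src_u8 and dst_s16 are hypothetical CL tensors assumed to be initialised as DataType::U8 and DataType::S16:

    CLDepthConvertLayerKernel convert;
    convert.configure(&src_u8, &dst_s16, ConvertPolicy::SATURATE, 4 /* shift */);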
diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
index eb1bf58b1b..bd37e35334 100644
--- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
@@ -56,7 +56,7 @@ public:
* 5x5 convolution with stride_x = 1/2, stride_y = 1/2
*
* @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported:Same as @p input.
@@ -70,7 +70,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel
*
* @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported:Same as @p input.
diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h
index 9340e9a8d8..1947a98ba3 100644
--- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h
+++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h
@@ -51,11 +51,11 @@ public:
/** Set the accumulate buffer and the biases of the kernel.
*
* @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: S32/QS16/QS32/F16/F32
+ * Data type supported: S32/QS32/F16/F32
* @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
* @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
* Required parameter if output is of QASYMM8 type.
- * Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * Data types supported: QASYMM8/F16/F32
 * @param[in] result_fixedpoint_multiplier (Optional) Fixed point multiplier applied to each element of the input matrix once the result_offset has been added
 * @param[in] result_shift (Optional) Integer shift used to perform a rounding division by a power of two after the fixed point multiplication
 * @param[in] result_offset_after_shift (Optional) Offset applied to the result before converting it back to QASYMM8
@@ -65,10 +65,10 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerOutputStageKernel
*
* @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: QS16/QS32/F16/F32
+ * Data type supported: QS32/F16/F32
* @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
* @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Data type supported: QS8/QS16/F16/F32
+ * Data type supported: F16/F32
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr);
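The three optional parameters above describe a fixed point requantisation to QASYMM8. A rough scalar model of that arithmetic, for illustration only: it assumes the multiplier is a Q0.31 fixed point value, and does not assert the exact rounding behaviour of the kernel:

    #include <algorithm>
    #include <cstdint>

    uint8_t requantize(int32_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        // Multiply by the Q0.31 fixed point multiplier, rounding to nearest.
        int64_t v = (static_cast<int64_t>(acc) * multiplier + (1ll << 30)) >> 31;
        // Rounded division by 2^shift (result_shift).
        if (shift > 0)
            v = (v + (1ll << (shift - 1))) >> shift;
        v += offset; // result_offset_after_shift
        return static_cast<uint8_t>(std::min<int64_t>(255, std::max<int64_t>(0, v)));
    }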
diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
index 18031c7e7e..20e872eccb 100644
--- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h
+++ b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
@@ -51,7 +51,7 @@ public:
/** Initialise the kernel's input, output and border mode.
*
- * @param[in,out] tensor Tensor to process Data types supported: U8/QS8/S16/QS16/S32/F16/F32.
+ * @param[in,out] tensor Tensor to process Data types supported: U8/S16/S32/F16/F32.
* @param[in] border_size Size of the border to fill in elements.
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index c0fef45afe..7f8e766f1a 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -64,14 +64,14 @@ public:
CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input
* @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block
*/
void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
* @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block
*
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
index 2956f93cdc..f201af0d5e 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,13 +46,13 @@ public:
CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
/** Set the accumulate buffer and the biases of the kernel.
*
- * @param[in, out] accum The accumulate tensor to convert. Data types supported: QS8/QS16/F16/F32
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
* @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
*/
void configure(ICLTensor *accum, const ICLTensor *biases);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAccumulateBiasesKernel
*
- * @param[in] accum The accumulate tensor to convert. Data types supported: QS8/QS16/F16/F32
+ * @param[in] accum The accumulate tensor to convert. Data types supported: F16/F32
* @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
* @param[in] gpu_target GPU target
*
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h
index 3755d943c5..bf8e1d4b17 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h
@@ -52,14 +52,14 @@ public:
*
* @note The input and output tensors must have the same dimensions
*
- * @param[in] input Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32
* @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref CLGEMMMatrixMultiplyKernel. Data type supported: same as @p input
* @param[in] beta Weight of matrix C
*/
void configure(const ICLTensor *input, ICLTensor *output, float beta);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAdditionKernel.
*
- * @param[in] input Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32
* @param[in] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref CLGEMMMatrixMultiplyKernel. Data type supported: same as @p input
* @param[in] beta Weight of matrix C
*
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 15bba0cd0f..1b6a0c87a9 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -53,7 +53,7 @@ public:
CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
/** Initialise the kernel's input, output and alpha
*
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32
* @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
@@ -64,7 +64,7 @@ public:
void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
*
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32
* @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0
* @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index 9a3069eab6..47a4ad515b 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -70,14 +70,14 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input
* @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
*/
void configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output Output tensor. Data type supported: same as @p input.
* @param[in] mult_transpose1xW_width Multiplication factor for the width of the 1xW transposed block
*
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index 7e119a32a8..fc930abcbe 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -69,7 +69,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32
* @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
* while every dimension above represents a batch. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
@@ -81,7 +81,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32
* @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
* while every dimension above represents a batch. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
@@ -113,7 +113,7 @@ private:
/** Chooses and configure the right kernel for the given input arguments.
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32
* @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
* while every dimension above represents a batch. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
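The kernel rearranges each convolution window into a column so that convolution can be carried out as a matrix multiplication. A scalar sketch of the idea for a single-channel input with stride 1 and no padding (an illustration, not the kernel's actual memory layout):

    // out has shape (kw*kh) x (out_w*out_h); each column holds one window of in.
    void im2col(const float *in, int w, int h, int kw, int kh, float *out)
    {
        const int out_w = w - kw + 1, out_h = h - kh + 1;
        int col = 0;
        for (int y = 0; y < out_h; ++y)
            for (int x = 0; x < out_w; ++x, ++col)
                for (int ky = 0; ky < kh; ++ky)
                    for (int kx = 0; kx < kw; ++kx)
                        out[(ky * kw + kx) * (out_w * out_h) + col] = in[(y + ky) * w + (x + kx)];
    }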
diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
index ef00e59e5c..f2d37a781c 100644
--- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -48,7 +48,7 @@ public:
/** Set the input and output tensors.
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32.
+ * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32.
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
@@ -56,7 +56,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32.
+ * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32.
* @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*
diff --git a/arm_compute/core/CL/kernels/CLPermuteKernel.h b/arm_compute/core/CL/kernels/CLPermuteKernel.h
index b01df64ebd..21da141c0d 100644
--- a/arm_compute/core/CL/kernels/CLPermuteKernel.h
+++ b/arm_compute/core/CL/kernels/CLPermuteKernel.h
@@ -49,14 +49,14 @@ public:
CLPermuteKernel &operator=(CLPermuteKernel &&) = default;
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output The output tensor. Data types supported: Same as @p input
* @param[in] perm Permutation vector
*/
void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
/** Static function to check if given info will lead to a valid configuration of @ref CLPermuteKernel
*
- * @param[in] input First tensor input info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32.
+ * @param[in] input First tensor input info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
* @param[in] output Output tensor info. Data types supported: same as @p input.
* @param[in] perm Permutation vector
*
diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
index fcabb614df..b835aa701b 100644
--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
@@ -49,11 +49,11 @@ public:
CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
/** Initialise the kernel's input, output and border mode.
*
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32.
* @param[in] input2 An input tensor. Data types supported: same as @p input1.
- * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
*/
@@ -61,11 +61,11 @@ public:
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
/** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
*
- * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32.
* @param[in] input2 An input tensor info. Data types supported: same as @p input1.
- * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
* @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
*
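A short usage sketch for the multiplication kernel, using 1/255 as the scale (one of the two allowed forms: 1/255, or 1/2^n with 0 <= n <= 15); a, b and out are illustrative tensors assumed to be initialised as in the addition example earlier:

    CLPixelWiseMultiplicationKernel mul_kernel;
    mul_kernel.configure(&a, &b, &out, 1.f / 255.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);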
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
index c13507785b..db1a756229 100644
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
@@ -51,16 +51,15 @@ public:
/** Set the input and output tensors.
*
- * @note QS8 and QS16 are supported only for pool sizes 3, 5 and 7
*
- * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel
*
- * @param[in] input Source tensor info. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
* @param[in] output Destination tensor info. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*
diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
index 044b5e7006..b253d66f4f 100644
--- a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ public:
~CLReshapeLayerKernel() = default;
/** Set the input and output of the kernel
*
- * @param[in] input Source tensor. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
* @param[out] output Destination tensor. Data type supported: Same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
index c562565175..b272878fe7 100644
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -38,13 +38,13 @@ class CLLogits1DMaxKernel : public ICLSimple3DKernel
public:
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
* @param[out] output Destination tensor. Data types supported: same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxKernel
*
- * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
* @param[in] output Destination tensor. Data types supported: same as @p input
*
* @return a status
@@ -68,7 +68,7 @@ public:
CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
* @param[in] max Max values tensor. Data types supported: same as @p input
* @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
* @param[out] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
@@ -77,7 +77,7 @@ public:
void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
/** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DShiftExpSumKernel
*
- * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
* @param[in] max Max values tensor. Data types supported: same as @p input
* @param[in] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
* @param[in] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
@@ -116,7 +116,7 @@ public:
CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: F16/F32
* @param[in,out] max Max values tensor. Data types supported: same as @p input
* @param[out] output Destination tensor. Data types supported: same as @p input
* @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
@@ -125,7 +125,7 @@ public:
void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
/** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: F16/F32
* @param[in] max Max values tensor. Data types supported: same as @p input
* @param[in] output Destination tensor. Data types supported: same as @p input
* @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input
@@ -175,7 +175,7 @@ public:
CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32
+ * @param[in] input Source tensor. Data types supported: S32/F16/F32
* @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
* @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent. (Default = 1.0)
@@ -183,7 +183,7 @@ public:
void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta = 1.0f);
/** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32
+ * @param[in] input Source tensor. Data types supported: S32/F16/F32
* @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
* @param[in] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
*
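Taken together, the kernels in this header implement the numerically stable softmax decomposition (subtract the row maximum before exponentiation). A scalar reference of what the pipeline computes, written only to illustrate how the work splits across the max, shift/exp/sum and norm stages:

    #include <algorithm>
    #include <cmath>

    void softmax_1d(const float *x, float *y, int n, float beta)
    {
        float m = x[0];
        for (int i = 1; i < n; ++i)
            m = std::max(m, x[i]);              // max stage
        float sum = 0.f;
        for (int i = 0; i < n; ++i)
        {
            y[i] = std::exp(beta * (x[i] - m)); // shift + exp stage
            sum += y[i];
        }
        for (int i = 0; i < n; ++i)
            y[i] /= sum;                        // norm stage
    }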
diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h
index 2e1b481d3f..09d7a8a430 100644
--- a/arm_compute/core/CL/kernels/CLTransposeKernel.h
+++ b/arm_compute/core/CL/kernels/CLTransposeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,13 +40,13 @@ class CLTransposeKernel : public ICLSimple2DKernel
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output Output tensor. Data type supported: Same as @p input
*
* @return a status
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
index 7a54284199..664fc3c216 100644
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -69,7 +69,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32
* @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
* dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
* @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
@@ -79,7 +79,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
*
* @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32
* @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
* dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
* @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
index 5b8a318320..d206eb0da7 100644
--- a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
@@ -52,7 +52,7 @@ public:
~CLWidthConcatenateLayerKernel() = default;
/** Initialise the kernel's inputs and output
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] width_offset The offset on the X axis.
* @param[in,out] output Output tensor. Data types supported: Same as @p input.
*
@@ -60,7 +60,7 @@ public:
void configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] width_offset The offset on the X axis.
* @param[in] output Output tensor info. Data types supported: Same as @p input.
*
diff --git a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
index 3d6b43641e..5e9ae43ee0 100644
--- a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
@@ -56,14 +56,14 @@ public:
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output The output tensor. Data types supported: Same as @p input
* @param[in] perm Permutation vector
*/
void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
/** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel
*
- * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output The output tensor. Data types supported: Same as @p input
* @param[in] perm Permutation vector
*
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
deleted file mode 100644
index 6e00500b10..0000000000
--- a/arm_compute/core/FixedPoint.h
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_FIXEDPOINT_H__
-#define __ARM_COMPUTE_FIXEDPOINT_H__
-
-#include <cstdint>
-
-namespace arm_compute
-{
-using qint8_t = int8_t; /**< 8 bit fixed point scalar value */
-using qint16_t = int16_t; /**< 16 bit fixed point scalar value */
-using qint32_t = int32_t; /**< 32 bit fixed point scalar value */
-using qint64_t = int64_t; /**< 64 bit fixed point scalar value */
-
-/** 8 bit fixed point scalar saturating shift left
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] shift Shift amount (positive only values)
- *
- * @return The result of the 8 bit fixed point shift. The result is saturated in case of overflow
- */
-qint8_t sqshl_qs8(qint8_t a, int shift);
-
-/** 8 bit fixed point scalar shift right
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] shift Shift amount (positive only values)
- *
- * @return The result of the 8 bit fixed point shift
- */
-qint8_t sshr_qs8(qint8_t a, int shift);
-
-/** 16 bit fixed point scalar shift right
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] shift Shift amount (positive only values)
- *
- * @return The result of the 16 bit fixed point shift
- */
-qint16_t sshr_qs16(qint16_t a, int shift);
-
-/** 16 bit fixed point scalar saturating shift left
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] shift Shift amount (positive only values)
- *
- * @return The result of the 16 bit fixed point shift. The result is saturated in case of overflow
- */
-qint16_t sqshl_qs16(qint16_t a, int shift);
-
-/** 8 bit fixed point scalar absolute value
- *
- * @param[in] a 8 bit fixed point input
- *
- * @return The result of the 8 bit fixed point absolute value
- */
-qint8_t sabs_qs8(qint8_t a);
-
-/** 16 bit fixed point scalar absolute value
- *
- * @param[in] a 16 bit fixed point input
- *
- * @return The result of the 16 bit fixed point absolute value
- */
-qint16_t sabs_qs16(qint16_t a);
-
-/** 8 bit fixed point scalar add
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] b Second 8 bit fixed point input
- *
- * @return The result of the 8 bit fixed point addition
- */
-qint8_t sadd_qs8(qint8_t a, qint8_t b);
-
-/** 16 bit fixed point scalar add
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] b Second 16 bit fixed point input
- *
- * @return The result of the 16 bit fixed point addition
- */
-qint16_t sadd_qs16(qint16_t a, qint16_t b);
-
-/** 8 bit fixed point scalar saturating add
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] b Second 8 bit fixed point input
- *
- * @return The result of the 8 bit fixed point addition. The result is saturated in case of overflow
- */
-qint8_t sqadd_qs8(qint8_t a, qint8_t b);
-
-/** 16 bit fixed point scalar saturating add
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] b Second 16 bit fixed point input
- *
- * @return The result of the 16 bit fixed point addition. The result is saturated in case of overflow
- */
-qint16_t sqadd_qs16(qint16_t a, qint16_t b);
-
-/** 32 bit fixed point scalar saturating add
- *
- * @param[in] a First 32 bit fixed point input
- * @param[in] b Second 32 bit fixed point input
- *
- * @return The result of the 32 bit fixed point addition. The result is saturated in case of overflow
- */
-qint32_t sqadd_qs32(qint32_t a, qint32_t b);
-
-/** 8 bit fixed point scalar subtraction
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] b Second 8 bit fixed point input
- *
- * @return The result of the 8 bit fixed point subtraction
- */
-qint8_t ssub_qs8(qint8_t a, qint8_t b);
-
-/** 16 bit fixed point scalar subtraction
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] b Second 16 bit fixed point input
- *
- * @return The result of the 16 bit fixed point subtraction
- */
-qint16_t ssub_qs16(qint16_t a, qint16_t b);
-
-/** 8 bit fixed point scalar saturating subtraction
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] b Second 8 bit fixed point input
- *
- * @return The result of the 8 bit fixed point subtraction. The result is saturated in case of overflow
- */
-qint8_t sqsub_qs8(qint8_t a, qint8_t b);
-
-/** 16 bit fixed point scalar saturating subtraction
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] b Second 16 bit fixed point input
- *
- * @return The result of the 16 bit fixed point subtraction. The result is saturated in case of overflow
- */
-qint16_t sqsub_qs16(qint16_t a, qint16_t b);
-
-/** 8 bit fixed point scalar multiply
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] b Second 8 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point multiplication.
- */
-qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position);
-
-/** 16 bit fixed point scalar multiply
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] b Second 16 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point multiplication.
- */
-qint16_t smul_qs16(qint16_t a, qint16_t b, int fixed_point_position);
-
-/** 8 bit fixed point scalar saturating multiply
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] b Second 8 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point multiplication. The result is saturated in case of overflow
- */
-qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position);
-
-/** 16 bit fixed point scalar saturating multiply
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] b Second 16 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow
- */
-qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position);
-
-/** 8 bit fixed point scalar multiply long
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] b Second 8 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point long multiplication, widened to 16 bit (which prevents overflow)
- */
-qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position);
-
-/** 16 bit fixed point scalar multiply long
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] b Second 16 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point long multiplication, widened to 32 bit (which prevents overflow)
- */
-qint32_t sqmull_qs16(qint16_t a, qint16_t b, int fixed_point_position);
-
-/** 8 bit fixed point scalar inverse square root
- *
- * @param[in] a 8 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point inverse square root.
- */
-qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position);
-
-/** 16 bit fixed point scalar inverse square root
- *
- * @param[in] a 16 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point inverse square root.
- */
-qint16_t sinvsqrt_qs16(qint16_t a, int fixed_point_position);
-
-/** 8 bit fixed point scalar division
- *
- * @param[in] a First 8 bit fixed point input
- * @param[in] b Second 8 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point division.
- */
-qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position);
-
-/** 16 bit fixed point scalar division
- *
- * @param[in] a First 16 bit fixed point input
- * @param[in] b Second 16 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point division.
- */
-qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position);
-
-/** 8 bit fixed point scalar exponential
- *
- * @param[in] a 8 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point exponential.
- */
-qint8_t sqexp_qs8(qint8_t a, int fixed_point_position);
-
-/** 16 bit fixed point scalar exponential
- *
- * @param[in] a 16 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point exponential.
- */
-qint16_t sqexp_qs16(qint16_t a, int fixed_point_position);
-
-/** 16 bit fixed point scalar exponential
- *
- * @param[in] a 16 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point exponential.
- */
-qint16_t sexp_qs16(qint16_t a, int fixed_point_position);
-
-/** 8 bit fixed point scalar logarithm
- *
- * @param[in] a 8 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point logarithm.
- */
-qint8_t slog_qs8(qint8_t a, int fixed_point_position);
-
-/** 16 bit fixed point scalar logarithm
- *
- * @param[in] a 16 bit fixed point input
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point logarithm.
- */
-qint16_t slog_qs16(qint16_t a, int fixed_point_position);
-
-/** Convert an 8 bit fixed point to float
- *
- * @param[in] a Input to convert
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 8 bit fixed point -> float
- */
-float scvt_f32_qs8(qint8_t a, int fixed_point_position);
-
-/** Convert a float to 8 bit fixed point
- *
- * @param[in] a Input to convert
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 8 bit fixed point
- */
-qint8_t sqcvt_qs8_f32(float a, int fixed_point_position);
-
-/** Convert a 16 bit fixed point to float
- *
- * @param[in] a Input to convert
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 16 bit fixed point -> float
- */
-float scvt_f32_qs16(qint16_t a, int fixed_point_position);
-
-/** Convert a float to 16 bit fixed point
- *
- * @param[in] a Input to convert
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 16 bit fixed point
- */
-qint16_t sqcvt_qs16_f32(float a, int fixed_point_position);
-
-/** Scalar saturating move and narrow.
- *
- * @param[in] a Input to convert to 8 bit fixed point
- *
- * @return The narrowing conversion to 8 bit
- */
-qint8_t sqmovn_qs16(qint16_t a);
-
-/** Scalar saturating move and narrow.
- *
- * @param[in] a Input to convert to 16 bit fixed point
- *
- * @return The narrowing conversion to 16 bit
- */
-qint16_t sqmovn_qs32(qint32_t a);
-}
-#include "arm_compute/core/FixedPoint.inl"
-#endif /* __ARM_COMPUTE_FIXEDPOINT_H__ */
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
deleted file mode 100644
index eb3516e8db..0000000000
--- a/arm_compute/core/FixedPoint.inl
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-
-#include <cmath>
-#include <limits>
-
-namespace arm_compute
-{
-inline qint8_t sqshl_qs8(qint8_t a, int shift)
-{
- qint16_t tmp = static_cast<qint16_t>(a) << shift;
-
- // Saturate the result in case of overflow and cast to qint8_t
- return utility::saturate_cast<qint8_t>(tmp);
-}
-
-inline qint16_t sqshl_qs16(qint16_t a, int shift)
-{
- qint32_t tmp = static_cast<qint32_t>(a) << shift;
-
- // Saturate the result in case of overflow and cast to qint16_t
- return utility::saturate_cast<qint16_t>(tmp);
-}
-
-inline qint8_t sshr_qs8(qint8_t a, int shift)
-{
- ARM_COMPUTE_ERROR_ON_MSG(shift == 0, "Shift should not be zero");
- const qint8_t round_val = 1 << (shift - 1);
- return sqadd_qs8(a, round_val) >> shift;
-}
-
-inline qint16_t sshr_qs16(qint16_t a, int shift)
-{
- ARM_COMPUTE_ERROR_ON_MSG(shift == 0, "Shift should not be zero");
- const qint16_t round_val = 1 << (shift - 1);
- return sqadd_qs16(a, round_val) >> shift;
-}
-
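Note that sshr_qs8 / sshr_qs16 are rounding shifts, not plain arithmetic shifts: adding 1 << (shift - 1) before shifting rounds the dropped fraction to nearest. A small worked case:

    // shift = 3, round_val = 1 << 2 = 4
    int plain   = 12 >> 3;        // 1 (truncates 12/8 = 1.5)
    int rounded = (12 + 4) >> 3;  // 2 (rounds 1.5 to nearest)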
-inline qint8_t sabs_qs8(qint8_t a)
-{
- return (a < 0) ? (a == std::numeric_limits<int8_t>::min()) ? std::numeric_limits<int8_t>::max() : -a : a;
-}
-
-inline qint16_t sabs_qs16(qint16_t a)
-{
- return (a < 0) ? (a == std::numeric_limits<int16_t>::min()) ? std::numeric_limits<int16_t>::max() : -a : a;
-}
-
-inline qint8_t sadd_qs8(qint8_t a, qint8_t b)
-{
- return a + b;
-}
-
-inline qint16_t sadd_qs16(qint16_t a, qint16_t b)
-{
- return a + b;
-}
-
-inline qint8_t sqadd_qs8(qint8_t a, qint8_t b)
-{
- // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow
- qint16_t tmp = (static_cast<qint16_t>(a) + static_cast<qint16_t>(b));
-
- // Saturate the result in case of overflow and cast to qint8_t
- return utility::saturate_cast<qint8_t>(tmp);
-}
-
-inline qint16_t sqadd_qs16(qint16_t a, qint16_t b)
-{
- // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow
- qint32_t tmp = (static_cast<qint32_t>(a) + static_cast<qint32_t>(b));
-
- // Saturate the result in case of overflow and cast to qint16_t
- return utility::saturate_cast<qint16_t>(tmp);
-}
-
-inline qint32_t sqadd_qs32(qint32_t a, qint32_t b)
-{
- // We need to store the temporary result in qint64_t otherwise we cannot evaluate the overflow
- qint64_t tmp = (static_cast<qint64_t>(a) + static_cast<qint64_t>(b));
-
- // Saturate the result in case of overflow and cast to qint32_t
- return utility::saturate_cast<qint32_t>(tmp);
-}
-
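All the saturating scalar ops share the same widen-then-clamp pattern. A standalone sketch of the qs8 case, using std::clamp in place of the library's utility::saturate_cast:

    #include <algorithm>
    #include <cstdint>

    int8_t sat_add_s8(int8_t a, int8_t b)
    {
        // Widen so the exact sum is representable, then clamp to the int8_t range
        const int16_t sum = static_cast<int16_t>(a) + static_cast<int16_t>(b);
        return static_cast<int8_t>(std::clamp<int16_t>(sum, INT8_MIN, INT8_MAX));
    }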
-inline qint8_t ssub_qs8(qint8_t a, qint8_t b)
-{
- return a - b;
-}
-
-inline qint16_t ssub_qs16(qint16_t a, qint16_t b)
-{
- return a - b;
-}
-
-inline qint8_t sqsub_qs8(qint8_t a, qint8_t b)
-{
- // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow
- qint16_t tmp = static_cast<qint16_t>(a) - static_cast<qint16_t>(b);
-
- // Saturate the result in case of overflow and cast to qint8_t
- return utility::saturate_cast<qint8_t>(tmp);
-}
-
-inline qint16_t sqsub_qs16(qint16_t a, qint16_t b)
-{
- // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow
- qint32_t tmp = static_cast<qint32_t>(a) - static_cast<qint32_t>(b);
-
- // Saturate the result in case of overflow and cast to qint16_t
- return utility::saturate_cast<qint16_t>(tmp);
-}
-
-inline qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
-{
- const qint16_t round_up_const = (1 << (fixed_point_position - 1));
-
- qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
-
- // Rounding up
- tmp += round_up_const;
-
- return static_cast<qint8_t>(tmp >> fixed_point_position);
-}
-
-inline qint16_t smul_qs16(qint16_t a, qint16_t b, int fixed_point_position)
-{
- const qint32_t round_up_const = (1 << (fixed_point_position - 1));
-
- qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b);
-
- // Rounding up
- tmp += round_up_const;
-
- return static_cast<qint16_t>(tmp >> fixed_point_position);
-}
-
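The product of two values with N fractional bits carries 2N fractional bits, so the multiplies round and shift the result back by fixed_point_position. Worked through for smul_qs8 with fixed_point_position = 5 (Q2.5):

    //   a = 1.5 -> 48,  b = 2.0 -> 64
    //   tmp    = 48 * 64 = 3072   (10 fractional bits)
    //   tmp   += 1 << 4           (rounding constant = 16)
    //   result = 3088 >> 5 = 96   (96 / 32 = 3.0)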
-inline qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
-{
- const qint16_t round_up_const = (1 << (fixed_point_position - 1));
-
- qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
-
- // Rounding up
- tmp += round_up_const;
-
- return utility::saturate_cast<qint8_t>(tmp >> fixed_point_position);
-}
-
-inline qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position)
-{
- const qint32_t round_up_const = (1 << (fixed_point_position - 1));
-
- qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b);
-
- // Rounding up
- tmp += round_up_const;
-
- return utility::saturate_cast<qint16_t>(tmp >> fixed_point_position);
-}
-
-inline qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position)
-{
- const qint16_t round_up_const = (1 << (fixed_point_position - 1));
-
- qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
-
- // Rounding up
- tmp += round_up_const;
-
- return tmp >> fixed_point_position;
-}
-
-inline qint32_t sqmull_qs16(qint16_t a, qint16_t b, int fixed_point_position)
-{
- const qint32_t round_up_const = (1 << (fixed_point_position - 1));
-
- qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b);
-
- // Rounding up
- tmp += round_up_const;
-
- return tmp >> fixed_point_position;
-}
-
-inline qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position)
-{
- const qint8_t shift = 8 - (fixed_point_position + (__builtin_clz(a) - 24));
-
- const qint8_t const_three = (3 << fixed_point_position);
- qint8_t temp = shift < 0 ? (a << -shift) : (a >> shift);
- qint8_t x2 = temp;
-
- // We need three iterations to find the result
- for(int i = 0; i < 3; ++i)
- {
- qint8_t three_minus_dx = ssub_qs8(const_three, smul_qs8(temp, smul_qs8(x2, x2, fixed_point_position), fixed_point_position));
- x2 = (smul_qs8(x2, three_minus_dx, fixed_point_position) >> 1);
- }
-
- temp = shift < 0 ? (x2 << (-shift >> 1)) : (x2 >> (shift >> 1));
-
- return temp;
-}
-
-inline qint16_t sinvsqrt_qs16(qint16_t a, int fixed_point_position)
-{
- const qint16_t shift = 16 - (fixed_point_position + (__builtin_clz(a) - 16));
-
- const qint16_t const_three = (3 << fixed_point_position);
- qint16_t temp = shift < 0 ? (a << -shift) : (a >> shift);
- qint16_t x2 = temp;
-
- // We need three iterations to find the result
- for(int i = 0; i < 3; ++i)
- {
- qint16_t three_minus_dx = ssub_qs16(const_three, smul_qs16(temp, smul_qs16(x2, x2, fixed_point_position), fixed_point_position));
- x2 = smul_qs16(x2, three_minus_dx, fixed_point_position) >> 1;
- }
-
- temp = shift < 0 ? (x2 << ((-shift) >> 1)) : (x2 >> (shift >> 1));
-
- return temp;
-}
-
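Both sinvsqrt variants are Newton-Raphson iterations for y = 1/sqrt(a). The initial shift normalises the input so the iteration starts close to 1, each of the three steps refines

    x(n+1) = x(n) * (3 - a * x(n)^2) / 2

and half of the normalising shift is undone at the end, since sqrt(2^s) = 2^(s/2).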
-inline qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position)
-{
- const qint16_t temp = a << fixed_point_position;
- return static_cast<qint8_t>(temp / b);
-}
-
-inline qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position)
-{
- const qint32_t temp = a << fixed_point_position;
- return static_cast<qint16_t>(temp / b);
-}
-
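Dividing two values in the same Q-format would cancel the scale entirely, so the numerator is pre-shifted by fixed_point_position first. For sdiv_qs8 with fixed_point_position = 5:

    //   a = 3.0 -> 96,  b = 2.0 -> 64
    //   temp   = 96 << 5 = 3072   (quotient keeps 5 fractional bits)
    //   result = 3072 / 64 = 48   (48 / 32 = 1.5)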
-inline qint8_t sqexp_qs8(qint8_t a, int fixed_point_position)
-{
- // Constants
- const qint8_t const_one = (1 << fixed_point_position);
- const qint8_t ln2 = ((0x58 >> (6 - fixed_point_position)) + 1) >> 1;
- const qint8_t inv_ln2 = (((0x38 >> (6 - fixed_point_position)) + 1) >> 1) | const_one;
- const qint8_t A = ((0x7F >> (6 - fixed_point_position)) + 1) >> 1;
- const qint8_t B = ((0x3F >> (6 - fixed_point_position)) + 1) >> 1;
- const qint8_t C = ((0x16 >> (6 - fixed_point_position)) + 1) >> 1;
- const qint8_t D = ((0x05 >> (6 - fixed_point_position)) + 1) >> 1;
-
- // Polynomial expansion
- const int dec_a = (sqmul_qs8(a, inv_ln2, fixed_point_position) >> fixed_point_position);
- const qint8_t alpha = sabs_qs8(sqsub_qs8(a, sqmul_qs8(ln2, sqshl_qs8(dec_a, fixed_point_position), fixed_point_position)));
- qint8_t sum = sqadd_qs8(sqmul_qs8(alpha, D, fixed_point_position), C);
- sum = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), B);
- sum = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), A);
- sum = sqmul_qs8(alpha, sum, fixed_point_position);
- sum = sqadd_qs8(sum, const_one);
-
- return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs8(sum, dec_a);
-}
-
-inline qint16_t sqexp_qs16(qint16_t a, int fixed_point_position)
-{
- // Constants
- const qint16_t const_one = (1 << fixed_point_position);
- const qint16_t ln2 = ((0x58B9 >> (14 - fixed_point_position)) + 1) >> 1;
- const qint16_t inv_ln2 = (((0x38AA >> (14 - fixed_point_position)) + 1) >> 1) | const_one;
- const qint16_t A = ((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1;
- const qint16_t B = ((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1;
- const qint16_t C = ((0x1693 >> (14 - fixed_point_position)) + 1) >> 1;
- const qint16_t D = ((0x0592 >> (14 - fixed_point_position)) + 1) >> 1;
-
- // Polynomial expansion
- const int dec_a = (sqmul_qs16(a, inv_ln2, fixed_point_position) >> fixed_point_position);
- const qint16_t alpha = sabs_qs16(sqsub_qs16(a, sqmul_qs16(ln2, sqshl_qs16(dec_a, fixed_point_position), fixed_point_position)));
- qint16_t sum = sqadd_qs16(sqmul_qs16(alpha, D, fixed_point_position), C);
- sum = sqadd_qs16(sqmul_qs16(alpha, sum, fixed_point_position), B);
- sum = sqadd_qs16(sqmul_qs16(alpha, sum, fixed_point_position), A);
- sum = sqmul_qs16(alpha, sum, fixed_point_position);
- sum = sqadd_qs16(sum, const_one);
-
- return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs16(sum, dec_a);
-}
-
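Both sqexp variants use the standard range reduction

    exp(a) = 2^dec_a * exp(alpha),  with alpha = |a - dec_a * ln 2|

so the degree-4 polynomial built from A..D only needs to be accurate on the reduced interval [0, ln 2); the final scaling by 2^dec_a is a saturating left shift, or a plain right shift when dec_a is negative.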
-inline qint8_t slog_qs8(qint8_t a, int fixed_point_position)
-{
- // Constants
- qint8_t const_one = (1 << fixed_point_position);
- qint8_t ln2 = (0x58 >> (7 - fixed_point_position));
- qint8_t A = (0x5C >> (7 - fixed_point_position - 1));
- qint8_t B = -(0x56 >> (7 - fixed_point_position));
- qint8_t C = (0x29 >> (7 - fixed_point_position));
- qint8_t D = -(0x0A >> (7 - fixed_point_position));
-
- if((const_one == a) || (a <= 0)) // also guards a == 0, which would otherwise divide by zero below
- {
- return 0;
- }
- else if(a < const_one)
- {
- return -slog_qs8(sdiv_qs8(const_one, a, fixed_point_position), fixed_point_position);
- }
-
- // Remove even powers of 2
- qint8_t shift_val = 31 - __builtin_clz(a >> fixed_point_position);
- a >>= shift_val;
- a = ssub_qs8(a, const_one);
-
- // Polynomial expansion
- qint8_t sum = sqadd_qs8(sqmul_qs8(a, D, fixed_point_position), C);
- sum = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), B);
- sum = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), A);
- sum = sqmul_qs8(a, sum, fixed_point_position);
-
- return smul_qs8(sadd_qs8(sum, shift_val << fixed_point_position), ln2, fixed_point_position);
-}
-
-inline qint16_t slog_qs16(qint16_t a, int fixed_point_position)
-{
- // Constants
- qint16_t const_one = (1 << fixed_point_position);
- qint16_t ln2 = (0x58B9 >> (7 - fixed_point_position));
- qint16_t A = (0x5C0F >> (7 - fixed_point_position - 1));
- qint16_t B = -(0x56AE >> (7 - fixed_point_position));
- qint16_t C = (0x2933 >> (7 - fixed_point_position));
- qint16_t D = -(0x0AA7 >> (7 - fixed_point_position));
-
- if((const_one == a) || (a <= 0)) // also guards a == 0, which would otherwise divide by zero below
- {
- return 0;
- }
- else if(a < const_one)
- {
- return -slog_qs16(sdiv_qs16(const_one, a, fixed_point_position), fixed_point_position);
- }
-
- // Remove even powers of 2
- qint16_t shift_val = 31 - __builtin_clz(a >> fixed_point_position);
- a >>= shift_val;
- a = ssub_qs16(a, const_one);
-
- // Polynomial expansion
- qint16_t sum = sqadd_qs16(sqmul_qs16(a, D, fixed_point_position), C);
- sum = sqadd_qs16(sqmul_qs16(a, sum, fixed_point_position), B);
- sum = sqadd_qs16(sqmul_qs16(a, sum, fixed_point_position), A);
- sum = sqmul_qs16(a, sum, fixed_point_position);
-
- return smul_qs16(sadd_qs16(sum, shift_val << fixed_point_position), ln2, fixed_point_position);
-}
-
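slog relies on the complementary decomposition: after the powers of two are stripped, a = m * 2^shift_val with m in [1, 2), so

    log(a) = (shift_val + log2(m)) * ln 2

with the A..D polynomial effectively approximating log2(m) in terms of m - 1; inputs below one are folded back through log(a) = -log(1/a).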
-inline float scvt_f32_qs8(qint8_t a, int fixed_point_position)
-{
- return static_cast<float>(a) / (1 << fixed_point_position);
-}
-
-inline qint8_t sqcvt_qs8_f32(float a, int fixed_point_position)
-{
- // round_nearest_integer(a * 2^(fixed_point_position))
- return utility::saturate_cast<qint8_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
-}
-
-inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
-{
- return static_cast<float>(a) / (1 << fixed_point_position);
-}
-
-inline qint16_t sqcvt_qs16_f32(float a, int fixed_point_position)
-{
- // round_nearest_integer(a * 2^(fixed_point_position))
- return utility::saturate_cast<qint16_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
-}
-
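The float conversions are exact for multiples of 2^-fixed_point_position and round to nearest otherwise. A round-trip example with fixed_point_position = 10 (Q5.10 in a qint16_t):

    //   sqcvt_qs16_f32(0.75f, 10) = round(0.75 * 1024) = 768
    //   scvt_f32_qs16(768, 10)    = 768 / 1024         = 0.75f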
-inline qint8_t sqmovn_qs16(qint16_t a)
-{
- // Saturate the result in case of overflow and cast to qint8_t
- return utility::saturate_cast<qint8_t>(a);
-}
-
-inline qint16_t sqmovn_qs32(qint32_t a)
-{
- // Saturate the result in case of overflow and cast to qint16_t
- return utility::saturate_cast<qint16_t>(a);
-}
-}
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
index bf971a2729..fcbc3495c3 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
@@ -66,7 +66,7 @@ public:
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * The rest are optional and used for representing batches. Data types supported: F16/F32.
* @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index a3cbfb94e3..374e36442b 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -582,21 +582,19 @@ inline void permute(TensorShape &shape, const PermutationVector &perm)
}
}
-/** Auto initialize the tensor info (shape, number of channels, data type and fixed point position) if the current assignment is empty.
+/** Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
*
- * @param[in,out] info Tensor info used to check and assign.
- * @param[in] shape New shape.
- * @param[in] num_channels New number of channels.
- * @param[in] data_type New data type
- * @param[in] fixed_point_position New fixed point position
- * @param[in] quantization_info (Optional) New quantization info
+ * @param[in,out] info Tensor info used to check and assign.
+ * @param[in] shape New shape.
+ * @param[in] num_channels New number of channels.
+ * @param[in] data_type New data type
+ * @param[in] quantization_info (Optional) New quantization info
*
* @return True if the tensor info has been initialized
*/
bool auto_init_if_empty(ITensorInfo &info,
const TensorShape &shape,
int num_channels, DataType data_type,
- int fixed_point_position,
QuantizationInfo quantization_info = QuantizationInfo());
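For callers, migrating across this change means dropping the fixed_point_position argument at each call site; a hypothetical before/after (the shape and data type values are illustrative only):

    TensorInfo info;
    // before: auto_init_if_empty(info, TensorShape(16U, 16U), 1, DataType::F32, 0);
    auto_init_if_empty(info, TensorShape(16U, 16U), 1, DataType::F32);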
/** Auto initialize the tensor info using another tensor info.
@@ -647,16 +645,6 @@ bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type);
*/
bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout);
-/** Set the fixed point position to the specified value if
- * the current fixed point position is 0 and the data type is QS8 or QS16
- *
- * @param[in,out] info Tensor info used to check and assign.
- * @param[in] fixed_point_position New fixed point position
- *
- * @return True if the fixed point position has been changed.
- */
-bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position);
-
/** Set the quantization info to the specified value if
* the current quantization info is empty and the data type of asymmetric quantized type
*
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index b359811328..c0e4ab8d7d 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -202,7 +202,6 @@ inline bool auto_init_if_empty(ITensorInfo &info,
const TensorShape &shape,
int num_channels,
DataType data_type,
- int fixed_point_position,
QuantizationInfo quantization_info)
{
if(info.tensor_shape().total_size() == 0)
@@ -210,7 +209,6 @@ inline bool auto_init_if_empty(ITensorInfo &info,
info.set_data_type(data_type);
info.set_num_channels(num_channels);
info.set_tensor_shape(shape);
- info.set_fixed_point_position(fixed_point_position);
info.set_quantization_info(quantization_info);
return true;
}
@@ -225,7 +223,6 @@ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_s
info_sink.set_data_type(info_source.data_type());
info_sink.set_num_channels(info_source.num_channels());
info_sink.set_tensor_shape(info_source.tensor_shape());
- info_sink.set_fixed_point_position(info_source.fixed_point_position());
info_sink.set_quantization_info(info_source.quantization_info());
info_sink.set_data_layout(info_source.data_layout());
return true;
@@ -278,17 +275,6 @@ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout
return false;
}
-inline bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position)
-{
- if(info.fixed_point_position() == 0 && (info.data_type() == DataType::QS8 || info.data_type() == DataType::QS16))
- {
- info.set_fixed_point_position(fixed_point_position);
- return true;
- }
-
- return false;
-}
-
inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
{
if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index ce0cf53fdf..f113445fb7 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -81,15 +81,6 @@ public:
* @return Reference to this ITensorInfo object
*/
virtual ITensorInfo &set_tensor_shape(const TensorShape &shape) = 0;
- /** Set the fixed point position to the specified value
- *
- * @warning The fixed point position must be set once the data type has been configured
- *
- * @param[in] fixed_point_position The new fixed point position
- *
- * @return Reference to this ITensorInfo object
- */
- virtual ITensorInfo &set_fixed_point_position(int fixed_point_position) = 0;
/** Set the quantization settings (scale and offset) of the tensor.
*
* @param[in] quantization_info QuantizationInfo containing the scale and offset
@@ -158,11 +149,7 @@ public:
* @return Offset in bytes from the beginning of the memory allocation to access the element (x, y, z, ...)
*/
virtual size_t offset_element_in_bytes(const Coordinates &pos) const = 0;
- /** Fixed point position used when the tensor data type is QS8 or QS16
- *
- * @return The fixed point position that expresses the number of bits for the fractional part of the number
- */
- virtual int fixed_point_position() const = 0;
+
/** Element size in bytes calculated as data_size() * num_channels()
*
* @return The size of one element in bytes
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 504ec6c444..ce64a8e58b 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -24,1194 +24,10 @@
#ifndef __ARM_COMPUTE_NEFIXEDPOINT_H__
#define __ARM_COMPUTE_NEFIXEDPOINT_H__
-#include "arm_compute/core/FixedPoint.h"
-
#include <arm_neon.h>
namespace arm_compute
{
-using qint8x8_t = int8x8_t; /**< 8 bit fixed point vector with 8 elements */
-using qint8x8x2_t = int8x8x2_t; /**< 8 bit fixed point vector with 16 elements */
-using qint8x8x3_t = int8x8x3_t; /**< 8 bit fixed point vector with 24 elements */
-using qint8x8x4_t = int8x8x4_t; /**< 8 bit fixed point vector with 32 elements */
-using qint8x16_t = int8x16_t; /**< 8 bit fixed point vector with 16 elements */
-using qint8x16x2_t = int8x16x2_t; /**< 8 bit fixed point vector with 32 elements */
-using qint8x16x3_t = int8x16x3_t; /**< 8 bit fixed point vector with 48 elements */
-using qint8x16x4_t = int8x16x4_t; /**< 8 bit fixed point vector with 64 elements */
-using qint16x4_t = int16x4_t; /**< 16 bit fixed point vector with 4 elements */
-using qint16x4x2_t = int16x4x2_t; /**< 16 bit fixed point vector with 8 elements */
-using qint16x4x3_t = int16x4x3_t; /**< 16 bit fixed point vector with 12 elements */
-using qint16x4x4_t = int16x4x4_t; /**< 16 bit fixed point vector with 16 elements */
-using qint16x8_t = int16x8_t; /**< 16 bit fixed point vector with 8 elements */
-using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
-using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
-using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
-using qint32x2_t = int32x2_t; /**< 32 bit fixed point vector with 2 elements */
-using qint32x4_t = int32x4_t; /**< 32 bit fixed point vector with 4 elements */
-using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */
-
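The vector aliases above carry no behaviour of their own: the q* wrappers declared below lower directly onto the standard NEON intrinsics of the matching width. A self-contained sketch of a saturating 8-lane add using only ACLE intrinsics (vld1_s8 / vqadd_s8 / vst1_s8):

    #include <arm_neon.h>

    void sat_add8(const int8_t *a, const int8_t *b, int8_t *out)
    {
        const int8x8_t va = vld1_s8(a);   // load 8 lanes
        const int8x8_t vb = vld1_s8(b);
        vst1_s8(out, vqadd_s8(va, vb));   // lane-wise saturating add
    }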
-/** Get the lower half of a 16 elements vector
- *
- * @param[in] a vector of 16 elements
- *
- * @return 8 bit fixed point vector (8 elements)
- */
-qint8x8_t vget_low_qs8(qint8x16_t a);
-
-/** Get the lower half of an 8 elements vector
- *
- * @param[in] a vector of 8 elements
- *
- * @return 16 bit fixed point vector (4 elements)
- */
-qint16x4_t vget_low_qs16(qint16x8_t a);
-
-/** Get the higher half of a 16 elements vector
- *
- * @param[in] a vector of 16 elements
- *
- * @return 8 bit fixed point vector (8 elements)
- */
-qint8x8_t vget_high_qs8(qint8x16_t a);
-
-/** Get the higher half of an 8 elements vector
- *
- * @param[in] a vector of 8 elements
- *
- * @return 16 bit fixed point vector (4 elements)
- */
-qint16x4_t vget_high_qs16(qint16x8_t a);
-
-/** Load a single 8 bit fixed point vector from memory (8 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point vector to load
- *
- * @return 8 bit fixed point vector (8 elements)
- */
-qint8x8_t vld1_qs8(const qint8_t *addr);
-
-/** Load a single 16 bit fixed point vector from memory (4 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point vector to load
- *
- * @return 16 bit fixed point vector (4 elements)
- */
-qint16x4_t vld1_qs16(const qint16_t *addr);
-
-/** Load a single 8 bit fixed point vector from memory (16 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point vector to load
- *
- * @return 8 bit fixed point vector (16 elements)
- */
-qint8x16_t vld1q_qs8(const qint8_t *addr);
-
-/** Load a single 16 bit fixed point vector from memory (8 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point vector to load
- *
- * @return 16 bit fixed point vector (8 elements)
- */
-qint16x8_t vld1q_qs16(const qint16_t *addr);
-
-/** Load all lanes of 8 bit fixed point vector with same value from memory (8 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
- *
- * @return 8 bit fixed point vector (8 elements)
- */
-qint8x8_t vld1_dup_qs8(const qint8_t *addr);
-
-/** Load all lanes of 16 bit fixed point vector with same value from memory (4 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point scalar value to load
- *
- * @return 16 bit fixed point vector (4 elements)
- */
-qint16x4_t vld1_dup_qs16(const qint16_t *addr);
-
-/** Load all lanes of 8 bit fixed point vector with same value from memory (16 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
- *
- * @return 8 bit fixed point vector (16 elements)
- */
-qint8x16_t vld1q_dup_qs8(const qint8_t *addr);
-
-/** Load all lanes of 16 bit fixed point vector with same value from memory (8 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point scalar value to load
- *
- * @return 16 bit fixed point vector (8 elements)
- */
-qint16x8_t vld1q_dup_qs16(const qint16_t *addr);
-
-/** Load two 16 bit fixed point vectors from memory (8x2 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point vectors to load
- *
- * @return 16 bit fixed point vectors (8x2 elements)
- */
-qint16x8x2_t vld2q_qs16(const qint16_t *addr);
-
-/** Store a single 8 bit fixed point vector to memory (8 elements)
- *
- * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
- * @param[in] b 8 bit fixed point vector to store
- *
- */
-void vst1_qs8(qint8_t *addr, qint8x8_t b);
-
-/** Store a single 16 bit fixed point vector to memory (4 elements)
- *
- * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
- * @param[in] b 16 bit fixed point vector to store
- *
- */
-void vst1_qs16(qint16_t *addr, qint16x4_t b);
-
-/** Store a single 8 bit fixed point vector to memory (16 elements)
- *
- * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
- * @param[in] b 8 bit fixed point vector to store
- *
- */
-void vst1q_qs8(qint8_t *addr, qint8x16_t b);
-
-/** Store a single 16 bit fixed point vector to memory (8 elements)
- *
- * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
- * @param[in] b 16 bit fixed point vector to store
- *
- */
-void vst1q_qs16(qint16_t *addr, qint16x8_t b);
-
-/** Store two 16 bit fixed point vector to memory (8x2 elements)
- *
- * @param[in] addr Memory address where the 16 bit fixed point vectors should be stored
- * @param[in] b 16 bit fixed point vectors to store
- *
- */
-void vst2q_qs16(qint16_t *addr, qint16x8x2_t b);
-
-/** 16 bit fixed point vector saturating narrow (8 elements)
- *
- * @param[in] a 16 bit fixed point vector to convert
- *
- * @return 8 bit fixed point vector
- */
-qint8x8_t vqmovn_q16(qint16x8_t a);
-
-/** 32 bit fixed point vector saturating narrow (4 elements)
- *
- * @param[in] a 32 bit fixed point vector to convert
- *
- * @return 16 bit fixed point vector
- */
-qint16x4_t vqmovn_q32(qint32x4_t a);
-
-/** 8 bit fixed point vector duplicate (8 elements)
- *
- * @param[in] a 8 bit fixed point to duplicate
- *
- * @return The result of the vector duplication
- */
-qint8x8_t vdup_n_qs8(qint8_t a);
-
-/** 16 bit fixed point vector duplicate (4 elements)
- *
- * @param[in] a 16 bit fixed point to duplicate
- *
- * @return The result of the vector duplication
- */
-qint16x4_t vdup_n_qs16(qint16_t a);
-
-/** 8 bit fixed point vector duplicate (16 elements)
- *
- * @param[in] a 8 bit fixed point to duplicate
- *
- * @return The result of the vector duplication
- */
-qint8x16_t vdupq_n_qs8(qint8_t a);
-
-/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
- *
- * @param[in] a floating point value to convert and duplicate
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the vector duplication
- */
-qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
-
-/** Duplicate a float and convert it to 16 bit fixed point vector (8 elements)
- *
- * @param[in] a floating point value to convert and duplicate
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the vector duplication
- */
-qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position);
-
-/** 16 bit fixed point vector duplicate (8 elements)
- *
- * @param[in] a 16 bit fixed point to duplicate
- *
- * @return The result of the vector duplication
- */
-qint16x8_t vdupq_n_qs16(qint16_t a);
-
-/** Absolute value of 8 bit fixed point vector (8 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector absolute value
- */
-qint8x8_t vabs_qs8(qint8x8_t a);
-
-/** Absolute value of 16 bit fixed point vector (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector absolute value
- */
-qint16x4_t vabs_qs16(qint16x4_t a);
-
-/** Absolute value of 8 bit fixed point vector (16 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector absolute value
- */
-qint8x16_t vabsq_qs8(qint8x16_t a);
-
-/** Absolute value of 16 bit fixed point vector (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector absolute value
- */
-qint16x8_t vabsq_qs16(qint16x8_t a);
-
-/** Saturating absolute value of 8 bit fixed point vector (8 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector absolute value
- */
-qint8x8_t vqabs_qs8(qint8x8_t a);
-
-/** Saturating absolute value of 16 bit fixed point vector (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector absolute value
- */
-qint16x4_t vqabs_qs16(qint16x4_t a);
-
-/** Saturating absolute value of 8 bit fixed point vector (16 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector absolute value
- */
-qint8x16_t vqabsq_qs8(qint8x16_t a);
-
-/** Saturating absolute value of 16 bit fixed point vector (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector absolute value
- */
-qint16x8_t vqabsq_qs16(qint16x8_t a);
-
-/** 8 bit fixed point vector max (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector max operation
- */
-qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector max (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector max operation
- */
-qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector max (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector max operation
- */
-qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector max (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector max operation
- */
-qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector pairwise max (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector pairwise max operation
- */
-qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector pairwise max (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector pairwise max operation
- */
-qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector min (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector min operation
- */
-qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector min (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector min operation
- */
-qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector min (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector min operation
- */
-qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector min (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector min operation
- */
-qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector pairwise min (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector pairwise min operation
- */
-qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector pairwise min (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector pairwise min operation
- */
-qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector add (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition
- */
-qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector add (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector addition
- */
-qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector add (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition
- */
-qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector add (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector addition
- */
-qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector saturating add (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector saturating add (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector saturating add (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector saturating add (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector pairwise add long (8 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The pairwise sums of adjacent 8 bit elements, widened to 16 bit (4 elements)
- */
-int16x4_t vpaddl_qs8(qint8x8_t a);
-
-/** 8 bit fixed point vector subtraction (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector subtraction
- */
-qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector subtraction (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector subtraction
- */
-qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector subtraction (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector subtraction
- */
-qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector subtraction (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector subtraction
- */
-qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector saturating subtraction (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
- */
-qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector saturating subtraction (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow
- */
-qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector saturating subtraction (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
- */
-qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector saturating subtraction (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow
- */
-qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector multiply (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiplication.
- */
-qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiplication.
- */
-qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector multiply (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiplication.
- */
-qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiplication.
- */
-qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
- */
-qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow
- */
-qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
- */
-qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow
- */
-qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector long multiply (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point long vector multiplication.
- */
-qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector long multiply (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 32 bit fixed point long vector multiplication.
- */
-qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] c Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate
- */
-qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply-accumulate (4 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] c Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate
- */
-qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] c Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate
- */
-qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] c Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate
- */
-qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] c Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
- */
-qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply-accumulate (4 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] c Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
- */
-qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] c Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
- */
-qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] c Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
- */
-qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position);
-
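The multiply-accumulate variants rescale only the product: the accumulator @p a is already in the target Q format, so only b * c needs the rounding shift before the final (saturating) add. A scalar sketch under the same Q2.5 assumption; sqmla_qs8 is a hypothetical name:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// One lane of a saturating Q-format multiply-accumulate: a + ((b * c) >> p).
static int8_t sqmla_qs8(int8_t a, int8_t b, int8_t c, int fixed_point_position)
{
    int32_t prod = static_cast<int32_t>(b) * static_cast<int32_t>(c);
    prod = (prod + (1 << (fixed_point_position - 1))) >> fixed_point_position;
    const int32_t res = static_cast<int32_t>(a) + prod;
    return static_cast<int8_t>(std::min(127, std::max(-128, res)));
}

int main()
{
    // Q2.5: 1.0 + 0.5 * 1.5 = 1.75, i.e. 32/32 + 16/32 * 48/32 -> 56/32
    std::printf("%d\n", sqmla_qs8(32, 16, 48, 5)); // prints 56
}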
-/** 8 bit fixed point vector multiply-accumulate long (8 elements).
- * This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
- *
- * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] c Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate long
- */
-qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply-accumulate long (4 elements).
- * This operation performs the product between @p b and @p c and adds the result to the 32 bit fixed point vector @p a (a + b * c).
- *
- * @param[in] a First 32 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] c Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate long
- */
-qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector.
- * This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
- *
- * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 8 bit fixed point input vector
- * @param[in] c Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate long
- */
-qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply-accumulate long (4 elements). The saturation is performed on the 32 bit fixed point output vector.
- * This operation performs the product between @p b and @p c and adds the result to the 32 bit fixed point vector @p a (a + b * c).
- *
- * @param[in] a First 32 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] c Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate long
- */
-qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
-
-/** Convert a float vector with 4x2 elements to an 8 bit fixed point vector with 8 elements
- *
- * @param[in] a Float input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow
- */
-qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position);
-
-/** Convert a float vector with 4 elements to a 16 bit fixed point vector with 4 elements
- *
- * @param[in] a Float input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow
- */
-qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position);
-
-/** Convert a float vector with 4x4 elements to an 8 bit fixed point vector with 16 elements
- *
- * @param[in] a Float input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow
- */
-qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
-
-/** Convert a float vector with 4x2 elements to a 16 bit fixed point vector with 8 elements
- *
- * @param[in] a Float input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow
- */
-qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position);
-
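The float-to-fixed conversions compute sat(round(f * 2^fixed_point_position)); the implementations bias by +0.5 or -0.5 before truncating, which rounds half away from zero. A scalar sketch of one lane, with sqcvt_qs8_f32 a hypothetical helper:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scale by 2^p, round half away from zero, saturate to the int8 range.
static int8_t sqcvt_qs8_f32(float a, int fixed_point_position)
{
    const float scaled = a * static_cast<float>(1 << fixed_point_position) + (a >= 0.f ? 0.5f : -0.5f);
    const int32_t q = static_cast<int32_t>(scaled); // truncation after biasing
    return static_cast<int8_t>(std::min(127, std::max(-128, q)));
}

int main()
{
    std::printf("%d\n", sqcvt_qs8_f32(1.75f, 5));  // Q2.5: prints 56
    std::printf("%d\n", sqcvt_qs8_f32(100.0f, 5)); // out of range: saturates to 127
}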
-/** Convert an 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
- *
- * @param[in] a 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 8 bit fixed point -> float32x4x2
- */
-float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Convert a 16 bit fixed point vector with 4 elements to a float vector with 4 elements
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 16 bit fixed point -> float32x4
- */
-float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Convert an 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements
- *
- * @param[in] a 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 8 bit fixed point -> float32x4x4
- */
-float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Convert a 16 bit fixed point vector with 8 elements to a float vector with 4x2 elements
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 16 bit fixed point -> float32x4x2
- */
-float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position);
-
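The reverse direction needs no rounding: the integer lane value is simply divided by 2^fixed_point_position. A one-line scalar sketch (scvt_f32_qs8 is a hypothetical name):

#include <cstdio>

// A Q-format value interpreted as float: lane / 2^p.
static float scvt_f32_qs8(signed char a, int fixed_point_position)
{
    return static_cast<float>(a) / static_cast<float>(1 << fixed_point_position);
}

int main()
{
    std::printf("%f\n", scvt_f32_qs8(56, 5)); // Q2.5: prints 1.750000
}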
-/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit reciprocal (1/a).
- */
-qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method. (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit reciprocal (1/a).
- */
-qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit reciprocal (1/a).
- */
-qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method. (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit reciprocal (1/a).
- */
-qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position);
-
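The reciprocal kernels use Newton-Raphson division: normalise a into [0.5, 1), start from the minimax estimate 48/17 - 32/17 * a, then iterate x <- x * (2 - a * x), which roughly doubles the number of correct bits per step. A float sketch of the recurrence (an illustration, not the fixed point code itself):

#include <cstdio>

int main()
{
    const float a = 0.6f;                        // already normalised into [0.5, 1)
    float x = 48.0f / 17.0f - 32.0f / 17.0f * a; // minimax initial estimate
    for (int i = 0; i < 3; ++i)
    {
        x = x * (2.0f - a * x); // Newton-Raphson step for f(x) = 1/x - a
    }
    std::printf("%f (exact %f)\n", x, 1.0f / a); // converges to 1.666667
}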
-/** Division fixed point 8bit (8 elements)
- *
- * @param[in] a First 8bit fixed point input vector
- * @param[in] b Second 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The quotient in fixed point format.
- */
-qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
-
-/** Division fixed point 16 bit (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The quotient in fixed point format.
- */
-qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
-
-/** Division fixed point 8bit (16 elements)
- *
- * @param[in] a First 8bit fixed point input vector
- * @param[in] b Second 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The quotient in 8 bit fixed point format.
- */
-qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
-
-/** Division fixed point 16 bit (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The quotient in 16 bit fixed point format.
- */
-qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
-
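NEON has no native integer division, so these kernels can be composed from the primitives above: a / b = a * (1/b), with the reciprocal coming from the Newton-Raphson routine (an assumption about the composition, shown in float for clarity):

#include <cstdio>

// Reciprocal via three Newton-Raphson steps, valid for b in [0.5, 1).
static float recip(float b)
{
    float x = 48.0f / 17.0f - 32.0f / 17.0f * b;
    for (int i = 0; i < 3; ++i)
    {
        x = x * (2.0f - b * x);
    }
    return x;
}

int main()
{
    const float a = 0.4f, b = 0.8f;
    std::printf("%f (exact %f)\n", a * recip(b), a / b); // ~0.500000
}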
-/** Perform a 4th degree polynomial approximation. (8 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit Taylor approximation.
- */
-template <bool islog>
-qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Perform a 4th degree polynomial approximation. (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit Taylor approximation.
- */
-template <bool islog>
-qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Perform a 4th degree polynomial approximation. (16 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit Taylor approximation.
- */
-template <bool islog>
-qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Perform a 4th degree polynomial approximation. (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit Taylor approximation.
- */
-template <bool islog>
-qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position);
-
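A 4th degree approximation with a 4-entry coefficient table is typically evaluated with Horner's scheme. A float sketch using the exp_tab coefficients defined in the .inl below (the 1 + x * P(x) shape is an assumption about how the removed vtaylor_poly implementations were used):

#include <array>
#include <cstdio>

// Horner evaluation: c[0] + x*(c[1] + x*(c[2] + x*c[3])).
static float poly4(float x, const std::array<float, 4> &c)
{
    return c[0] + x * (c[1] + x * (c[2] + x * c[3]));
}

int main()
{
    const std::array<float, 4> exp_tab = { 0.9978546f, 0.4994721f, 0.1763723f, 0.0435108f };
    const float x = 0.5f;
    // exp(x) ~ 1 + x * P(x): prints ~1.648563 against exp(0.5) = 1.648721
    std::printf("%f\n", 1.0f + x * poly4(x, exp_tab));
}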
-/** Calculate saturating exponential fixed point 8bit (8 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit saturating exponential
- */
-qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate saturating exponential fixed point 16 bit (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit saturating exponential
- */
-qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate saturating exponential fixed point 8bit (16 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit saturating exponential
- */
-qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate saturating exponential fixed point 16 bit (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit saturating exponential
- */
-qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate logarithm fixed point 8 bit (8 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit logarithm.
- */
-qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate logarithm fixed point 16 bit (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit logarithm.
- */
-qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate logarithm fixed point 8 bit (16 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit logarithm.
- */
-qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate logarithm fixed point 16 bit (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit logarithm.
- */
-qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (8 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit inverse sqrt.
- */
-qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate inverse square root for fixed point 16 bit using Newton-Raphson method (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit inverse sqrt.
- */
-qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (8 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit inverse sqrt.
- */
-qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphson method (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit inverse sqrt.
- */
-qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (16 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit inverse sqrt.
- */
-qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate inverse square root for fixed point 16 bit using Newton-Raphson method (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit inverse sqrt.
- */
-qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (16 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit inverse sqrt.
- */
-qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphson method (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit inverse sqrt.
- */
-qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position);
-
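The inverse square root kernels iterate the classic recurrence x <- x * (3 - a * x * x) / 2, which converges to 1/sqrt(a). A float sketch of the iteration (assumption: this is the recurrence behind the declarations above; the fixed point versions also normalise a first):

#include <cstdio>

int main()
{
    const float a = 0.64f;
    float x = 1.0f; // crude initial guess for a in (0, 1]
    for (int i = 0; i < 4; ++i)
    {
        x = x * (3.0f - a * x * x) * 0.5f; // Newton-Raphson step for f(x) = 1/x^2 - a
    }
    std::printf("%f (exact %f)\n", x, 1.25f); // 1/sqrt(0.64) = 1.25
}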
-/** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated Hyperbolic Tangent.
- */
-qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate hyperbolic tangent for fixed point 16 bit (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated Hyperbolic Tangent.
- */
-qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate hyperbolic tangent for fixed point 8bit (16 elements)
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated Hyperbolic Tangent.
- */
-qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate hyperbolic tangent for fixed point 16bit (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated Hyperbolic Tangent.
- */
-qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position);
-
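tanh can be built from the saturating exponential above via tanh(x) = (e^(2x) - 1) / (e^(2x) + 1); assuming the removed kernels take this route, the composition looks like this in float:

#include <cmath>
#include <cstdio>

int main()
{
    const float x  = 0.5f;
    const float e2 = std::exp(2.0f * x);
    std::printf("%f (std::tanh %f)\n", (e2 - 1.0f) / (e2 + 1.0f), std::tanh(x));
}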
-/** Calculate saturating power (a^b) for fixed point 8 bit (16 elements).
- *
- * pow(a,b) = e^(b*log(a))
- *
- * @param[in] a 8bit fixed point input vector
- * @param[in] b 8bit fixed point power vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit power.
- */
-qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
-
-/** Calculate saturating power (a^b) for fixed point 16 bit (8 elements).
- *
- * pow(a,b) = e^(b*log(a))
- *
- * @param[in] a 16bit fixed point input vector
- * @param[in] b 16bit fixed point power vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16bit power.
- */
-qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
-
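The comment above is the whole algorithm: the power kernel is a composition of the saturating logarithm, multiply and exponential. A float cross-check of the identity:

#include <cmath>
#include <cstdio>

int main()
{
    const float a = 1.5f, b = 2.0f;
    // pow(a, b) = e^(b * log(a))
    std::printf("%f (std::pow %f)\n", std::exp(b * std::log(a)), std::pow(a, b));
}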
/** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements
*
* @param[in] a Float input vector
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index b86c3cbec3..14e51d825c 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -26,1965 +26,7 @@
namespace arm_compute
{
-/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
- * Format is in Q0.7 for all elements
- */
-static const std::array<qint8x8_t, 4> exp_tab_qs8 =
-{
- {
- vdup_n_s8(0x7F), // 0.9978546
- vdup_n_s8(0x3F), // 0.4994721
- vdup_n_s8(0x16), // 0.1763723
- vdup_n_s8(0x05), // 0.0435108
- }
-};
-
-/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
- * Format is in Q0.15 for all elements
- */
-static const std::array<qint16x4_t, 4> exp_tab_qs16 =
-{
- {
- vdup_n_s16(0x7FBA), // 0.9978546
- vdup_n_s16(0x3FE9), // 0.4994721
- vdup_n_s16(0x1693), // 0.1763723
- vdup_n_s16(0x0592), // 0.0435108
- }
-};
-
-/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
- * Format is in Q0.7 for all elements
- */
-static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
-{
- {
- vdupq_n_s8(0x7F), // 0.9978546
- vdupq_n_s8(0x3F), // 0.4994721
- vdupq_n_s8(0x16), // 0.1763723
- vdupq_n_s8(0x05), // 0.0435108
- }
-};
-
-/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
- * Format is in Q0.15 for all elements
- */
-static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
-{
- {
- vdupq_n_s16(0x7FBA), // 0.9978546
- vdupq_n_s16(0x3FE9), // 0.4994721
- vdupq_n_s16(0x1693), // 0.1763723
- vdupq_n_s16(0x0592), // 0.0435108
- }
-};
-
-/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
- * Format is in Q0.7 for all elements except the first one which is in Q1.6
- */
-static const std::array<qint8x8_t, 4> log_tab_qs8 =
-{
- {
- vdup_n_s8(0x5C), // 1.4384189
- vdup_n_s8(-0x56), // -0.6771900
- vdup_n_s8(0x29), // 0.3218538
- vdup_n_s8(-0x0A), // -0.0832229
- }
-};
-
-/** Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
- * Format is in Q0.15 for all elements except the first one which is in Q1.14
- */
-static const std::array<qint16x4_t, 4> log_tab_qs16 =
-{
- {
- vdup_n_s16(0x5C0F), // 1.4384189
- vdup_n_s16(-0x56AE), // -0.6771900
- vdup_n_s16(0x2933), // 0.3218538
- vdup_n_s16(-0x0AA7), // -0.0832229
- }
-};
-
-/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
- * Format is in Q0.7 for all elements except the first one which is in Q1.6
- */
-static const std::array<qint8x16_t, 4> log_tabq_qs8 =
-{
- {
- vdupq_n_s8(0x5C), // 1.4384189
- vdupq_n_s8(-0x56), // -0.6771900
- vdupq_n_s8(0x29), // 0.3218538
- vdupq_n_s8(-0x0A), // -0.0832229
- }
-};
-
-/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
- * Format is in Q0.15 for all elements except the first one which is in Q1.14
- */
-static const std::array<qint16x8_t, 4> log_tabq_qs16 =
-{
- {
- vdupq_n_s16(0x5C0F), // 1.4384189
- vdupq_n_s16(-0x56AE), // -0.6771900
- vdupq_n_s16(0x2933), // 0.3218538
- vdupq_n_s16(-0x0AA7), // -0.0832229
- }
-};
-
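Each table constant is its float coefficient scaled into the stated Q format, i.e. round(coefficient * 2^fractional_bits). For example the first log coefficient is stored in Q1.14, and round(1.4384189 * 2^14) = 23567 = 0x5C0F, matching log_tab_qs16; the Q0.7 exp table saturates its first entry to 0x7F because 0.9978546 * 128 rounds past the int8 maximum. A quick check:

#include <cmath>
#include <cstdio>

int main()
{
    // Q1.14 encoding of the leading log coefficient
    std::printf("0x%04X\n", static_cast<unsigned>(std::lround(1.4384189 * (1 << 14)))); // prints 0x5C0F
}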
#ifndef DOXYGEN_SKIP_THIS
-inline qint8x8_t vget_low_qs8(qint8x16_t a)
-{
- return vget_low_s8(a);
-}
-
-inline qint16x4_t vget_low_qs16(qint16x8_t a)
-{
- return vget_low_s16(a);
-}
-
-inline qint8x8_t vget_high_qs8(qint8x16_t a)
-{
- return vget_high_s8(a);
-}
-
-inline qint16x4_t vget_high_qs16(qint16x8_t a)
-{
- return vget_high_s16(a);
-}
-
-inline qint8x8_t vld1_qs8(const qint8_t *addr)
-{
- return vld1_s8(addr);
-}
-
-inline qint16x4_t vld1_qs16(const qint16_t *addr)
-{
- return vld1_s16(addr);
-}
-
-inline qint8x16_t vld1q_qs8(const qint8_t *addr)
-{
- return vld1q_s8(addr);
-}
-
-inline qint16x8_t vld1q_qs16(const qint16_t *addr)
-{
- return vld1q_s16(addr);
-}
-
-inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
-{
- return vld1_dup_s8(addr);
-}
-
-inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
-{
- return vld1_dup_s16(addr);
-}
-
-inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
-{
- return vld1q_dup_s8(addr);
-}
-
-inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
-{
- return vld1q_dup_s16(addr);
-}
-
-inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
-{
- return vld2q_s16(addr);
-}
-
-inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
-{
- vst1_s8(addr, b);
-}
-
-inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
-{
- vst1_s16(addr, b);
-}
-
-inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
-{
- vst1q_s8(addr, b);
-}
-
-inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
-{
- vst1q_s16(addr, b);
-}
-
-inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
-{
- vst2q_s16(addr, b);
-}
-
-inline qint8x8_t vqmovn_qs16(qint16x8_t a)
-{
- return vqmovn_s16(a);
-}
-
-inline qint16x4_t vqmovn_qs32(qint32x4_t a)
-{
- return vqmovn_s32(a);
-}
-
-inline qint8x8_t vdup_n_qs8(qint8_t a)
-{
- return vdup_n_s8(a);
-}
-
-inline qint16x4_t vdup_n_qs16(qint16_t a)
-{
- return vdup_n_s16(a);
-}
-
-inline qint8x16_t vdupq_n_qs8(qint8_t a)
-{
- return vdupq_n_s8(a);
-}
-
-inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
-{
- float32x4x4_t res =
- {
- {
- vdupq_n_f32(a),
- vdupq_n_f32(a),
- vdupq_n_f32(a),
- vdupq_n_f32(a),
- }
- };
- return vqcvtq_qs8_f32(res, fixed_point_position);
-}
-
-inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
-{
- float32x4x2_t res =
- {
- {
- vdupq_n_f32(a),
- vdupq_n_f32(a),
- }
- };
- return vqcvtq_qs16_f32(res, fixed_point_position);
-}
-
-inline qint16x8_t vdupq_n_qs16(qint16_t a)
-{
- return vdupq_n_s16(a);
-}
-
-inline qint32x4_t vdupq_n_qs32(qint32_t a)
-{
- return vdupq_n_s32(a);
-}
-
-inline qint8x8_t vabs_qs8(qint8x8_t a)
-{
- return vabs_s8(a);
-}
-
-inline qint16x4_t vabs_qs16(qint16x4_t a)
-{
- return vabs_s16(a);
-}
-
-inline qint8x16_t vabsq_qs8(qint8x16_t a)
-{
- return vabsq_s8(a);
-}
-
-inline qint16x8_t vabsq_qs16(qint16x8_t a)
-{
- return vabsq_s16(a);
-}
-
-inline qint8x8_t vqabs_qs8(qint8x8_t a)
-{
- return vqabs_s8(a);
-}
-
-inline qint16x4_t vqabs_qs16(qint16x4_t a)
-{
- return vqabs_s16(a);
-}
-
-inline qint8x16_t vqabsq_qs8(qint8x16_t a)
-{
- return vqabsq_s8(a);
-}
-
-inline qint16x8_t vqabsq_qs16(qint16x8_t a)
-{
- return vqabsq_s16(a);
-}
-
-inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
-{
- return vmax_s8(a, b);
-}
-
-inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
-{
- return vmax_s16(a, b);
-}
-
-inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
-{
- return vmaxq_s8(a, b);
-}
-
-inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
-{
- return vpmax_s8(a, b);
-}
-
-inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
-{
- return vpmax_s16(a, b);
-}
-
-inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
-{
- return vmaxq_s16(a, b);
-}
-
-inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
-{
- return vmin_s8(a, b);
-}
-
-inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
-{
- return vmin_s16(a, b);
-}
-
-inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
-{
- return vminq_s8(a, b);
-}
-
-inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
-{
- return vpmin_s8(a, b);
-}
-
-inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
-{
- return vpmin_s16(a, b);
-}
-
-inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
-{
- return vminq_s16(a, b);
-}
-
-inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
-{
- return vadd_s8(a, b);
-}
-
-inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
-{
- return vadd_s16(a, b);
-}
-
-inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
-{
- return vaddq_s8(a, b);
-}
-
-inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
-{
- return vaddq_s16(a, b);
-}
-
-inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
-{
- return vqadd_s8(a, b);
-}
-
-inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
-{
- return vqadd_s16(a, b);
-}
-
-inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
-{
- return vqadd_s32(a, b);
-}
-
-inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
-{
- return vqaddq_s8(a, b);
-}
-
-inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
-{
- return vqaddq_s16(a, b);
-}
-
-inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
-{
- return vqaddq_s32(a, b);
-}
-
-inline int16x4_t vpaddl_qs8(qint8x8_t a)
-{
- return vpaddl_s8(a);
-}
-
-inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
-{
- return vsub_s8(a, b);
-}
-
-inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
-{
- return vsub_s16(a, b);
-}
-
-inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
-{
- return vsubq_s8(a, b);
-}
-
-inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
-{
- return vsubq_s16(a, b);
-}
-
-inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
-{
- return vqsub_s8(a, b);
-}
-
-inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
-{
- return vqsub_s16(a, b);
-}
-
-inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
-{
- return vqsubq_s8(a, b);
-}
-
-inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
-{
- return vqsubq_s16(a, b);
-}
-
-inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- res = vmlal_s8(res, a, b);
-
- // Shift right by fixed_point_position
- res = vshlq_s16(res, fixed_point_position_s16);
-
- // Convert back to qint8
- return vmovn_s16(res);
-}
-
-inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- res = vmlal_s16(res, a, b);
-
- // Shift right by fixed_point_position
- res = vshlq_s32(res, fixed_point_position_s32);
-
- // Convert back to qint16
- return vmovn_s32(res);
-}
-
-inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary results with a constant used to round up the result
- qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
- qint16x8_t res1 = res0;
-
- // Vector multiply-accumulate long
- res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
- res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
-
- // Shift right by fixed_point_position
- res0 = vshlq_s16(res0, fixed_point_position_s16);
- res1 = vshlq_s16(res1, fixed_point_position_s16);
-
- // Convert back to qint8
- return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
-}
-
-inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary results with a constant used to round up the result
- qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
- qint32x4_t res1 = res0;
-
- // Vector multiply-accumulate long
- res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
- res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
-
- // Shift right by fixed_point_position
- res0 = vshlq_s32(res0, fixed_point_position_s32);
- res1 = vshlq_s32(res1, fixed_point_position_s32);
-
- // Convert back to qint16
- return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
-}
-
-inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- res = vmlal_s8(res, a, b);
-
- // Shift right by fixed_point_position
- res = vqshlq_s16(res, fixed_point_position_s16);
-
- // Convert back to qint8 and saturate
- return vqmovn_s16(res);
-}
-
-inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- res = vmlal_s16(res, a, b);
-
- // Shift right by fixed_point_position
- res = vqshlq_s32(res, fixed_point_position_s32);
-
- // Convert back to qint16 and saturate
- return vqmovn_s32(res);
-}
-
-inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary results with a constant used to round up the result
- qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
- qint16x8_t res1 = res0;
-
- // Vector multiply-accumulate long
- res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
- res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
-
- // Shift right by fixed_point_position
- res0 = vqshlq_s16(res0, fixed_point_position_s16);
- res1 = vqshlq_s16(res1, fixed_point_position_s16);
-
- // Convert back to qint8 and saturate
- return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
-}
-
-inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary results with a constant used to round up the result
- qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
- qint32x4_t res1 = res0;
-
- // Vector multiply-accumulate long
- res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
- res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
-
- // Shift right by fixed_point_position
- res0 = vqshlq_s32(res0, fixed_point_position_s32);
- res1 = vqshlq_s32(res1, fixed_point_position_s32);
-
- // Convert back to qint16 and saturate
- return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
-}
-
-inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- qint16x8_t res = vmull_s8(a, b);
-
- return vqrshlq_s16(res, fixed_point_position_s16);
-}
-
-inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long so that the rounding constant is applied
- tmp = vmlal_s16(tmp, a, b);
-
- // Shift right by fixed_point_position
- return vqshlq_s32(tmp, fixed_point_position_s32);
-}
-
-inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- tmp = vmlal_s8(tmp, b, c);
-
- // Shift right by fixed_point_position
- tmp = vshlq_s16(tmp, fixed_point_position_s16);
-
- // Convert back to qint8 and accumulate
- return vadd_s8(a, vmovn_s16(tmp));
-}
-
-inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- tmp = vmlal_s16(tmp, b, c);
-
- // Shift right by fixed_point_position
- tmp = vshlq_s32(tmp, fixed_point_position_s32);
-
- // Convert back to qint16 and accumulate
- return vadd_s16(a, vmovn_s32(tmp));
-}
-
-inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary results with a constant used to round up the result
- qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
- qint16x8_t tmp1 = tmp0;
-
- // Vector multiply-accumulate long
- tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
- tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
-
- // Shift right by fixed_point_position
- tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
- tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
-
- // Convert back to qint8 and accumulate
- return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
-}
-
-inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary results with a constant used to round up the result
- qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
- qint32x4_t tmp1 = tmp0;
-
- // Vector multiply-accumulate long
- tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
- tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
-
- // Shift right by fixed_point_position
- tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
- tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
-
- // Convert back to qint16 and accumulate
- return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
-}
-
-inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- tmp = vmlal_s8(tmp, b, c);
-
- // Shift right by fixed_point_position
- tmp = vqshlq_s16(tmp, fixed_point_position_s16);
-
- // Convert back to qint8 and accumulate
- return vqadd_s8(a, vqmovn_s16(tmp));
-}
-
-inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- tmp = vmlal_s16(tmp, b, c);
-
- // Shift right by fixed_point_position
- tmp = vqshlq_s32(tmp, fixed_point_position_s32);
-
- // Convert back to qint16 and accumulate
- return vqadd_s16(a, vqmovn_s32(tmp));
-}
-
-inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary results with a constant used to round up the result
- qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
- qint16x8_t tmp1 = tmp0;
-
- // Vector multiply-accumulate long
- tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
- tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
-
- // Shift right by fixed_point_position
- tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
- tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
-
- // Convert back to qint8 and accumulate
- qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
- return vqaddq_s8(a, res);
-}
-
-inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary results with a constant used to round up the result
- qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
- qint32x4_t tmp1 = tmp0;
-
- // Vector multiply-accumulate long
- tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
- tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
-
- // Shift right by fixed_point_position
- tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
- tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
-
- // Convert back to qint16 and accumulate
- qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
- return vqaddq_s16(a, res);
-}
-
-inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- tmp = vmlal_s8(tmp, b, c);
-
- // Shift right by fixed_point_position
- tmp = vshlq_s16(tmp, fixed_point_position_s16);
-
- // Accumulate
- return vaddq_s16(a, tmp);
-}
-
-inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- tmp = vmlal_s16(tmp, b, c);
-
- // Shift right by fixed_point_position
- tmp = vshlq_s32(tmp, fixed_point_position_s32);
-
- // Accumulate
- return vaddq_s32(a, tmp);
-}
-
-inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
-{
- const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- tmp = vmlal_s8(tmp, b, c);
-
- // Shift right by fixed_point_position
- tmp = vqshlq_s16(tmp, fixed_point_position_s16);
-
- // Accumulate
- return vqaddq_s16(a, tmp);
-}
-
-inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
-{
- const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
- // Initialize the temporary result with a constant used to round up the result
- qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
- // Vector multiply-accumulate long
- tmp = vmlal_s16(tmp, b, c);
-
- // Shift right by fixed_point_position
- tmp = vqshlq_s32(tmp, fixed_point_position_s32);
-
- // Accumulate
- return vqaddq_s32(a, tmp);
-}
-
-inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
-{
- const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
-
- float32x4x2_t res_f32 =
- {
- {
- vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
- vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
- }
- };
-
- res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
- res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
-
- const int32x4x2_t res_s32 =
- {
- {
- vcvtq_s32_f32(res_f32.val[0]),
- vcvtq_s32_f32(res_f32.val[1]),
- }
- };
-
- const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
-
- return vqmovn_s16(res_s16);
-}
-
-inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
-{
- const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
-
- float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
-
- res_f32 = vmlaq_f32(res_f32, a, pow2);
-
- const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
-
- return vqmovn_s32(res_s32);
-}
-
-inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
-{
- const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
-
- float32x4x4_t res_f32 =
- {
- {
- vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
- vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
- vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
- vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
- }
- };
-
- res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
- res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
- res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
- res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
-
- const int32x4x4_t res_s32 =
- {
- {
- vcvtq_s32_f32(res_f32.val[0]),
- vcvtq_s32_f32(res_f32.val[1]),
- vcvtq_s32_f32(res_f32.val[2]),
- vcvtq_s32_f32(res_f32.val[3]),
- }
- };
-
- const int16x8x2_t res_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
- vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
- }
- };
-
- return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
-}
-
-inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
-{
- const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
-
- float32x4x2_t res_f32 =
- {
- {
- vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
- vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
- }
- };
-
- res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
- res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
-
- const int32x4x2_t res_s32 =
- {
- {
- vcvtq_s32_f32(res_f32.val[0]),
- vcvtq_s32_f32(res_f32.val[1])
- }
- };
-
- return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
-}
-
-inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
-{
- const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
-
- const int16x8_t res_s16 = vmovl_s8(a);
-
- const int32x4x2_t res_s32 =
- {
- {
- vmovl_s16(vget_low_qs16(res_s16)),
- vmovl_s16(vget_high_qs16(res_s16))
- }
- };
-
- float32x4x2_t res_f32 =
- {
- {
- vcvtq_f32_s32(res_s32.val[0]),
- vcvtq_f32_s32(res_s32.val[1])
- }
- };
-
- res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
- res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
-
- return res_f32;
-}
-
-inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
-{
- const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
- const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
-
- return vmulq_f32(res_f32, pow2);
-}
-
-inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
-{
- const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
-
- const int16x8x2_t res_s16 =
- {
- {
- vmovl_s8(vget_low_s8(a)),
- vmovl_s8(vget_high_s8(a)),
- }
- };
-
- const int32x4x4_t res_s32 =
- {
- {
- vmovl_s16(vget_low_qs16(res_s16.val[0])),
- vmovl_s16(vget_high_qs16(res_s16.val[0])),
- vmovl_s16(vget_low_qs16(res_s16.val[1])),
- vmovl_s16(vget_high_qs16(res_s16.val[1])),
- }
- };
-
- float32x4x4_t res_f32 =
- {
- {
- vcvtq_f32_s32(res_s32.val[0]),
- vcvtq_f32_s32(res_s32.val[1]),
- vcvtq_f32_s32(res_s32.val[2]),
- vcvtq_f32_s32(res_s32.val[3])
- }
- };
-
- res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
- res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
- res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
- res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
-
- return res_f32;
-}
-
-inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
-{
- const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
-
- const int32x4x2_t res_s32 =
- {
- {
- vmovl_s16(vget_low_qs16(a)),
- vmovl_s16(vget_high_qs16(a))
- }
- };
-
- float32x4x2_t res_f32 =
- {
- {
- vcvtq_f32_s32(res_s32.val[0]),
- vcvtq_f32_s32(res_s32.val[1])
- }
- };
-
- res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
- res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
-
- return res_f32;
-}
-
-inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
-{
- // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
- const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
- const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
- const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
- const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
-
- // Find shift value
- const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
- const qint8x8_t temp = vshl_s8(a, shift_value);
-
- // Newton-Raphson division initial estimate X0 calculation
- qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));
-
- uint8x8_t set_one = vcgt_s8(x, const_one);
- x = vbsl_s8(set_one, const_one, x);
-
- // Use three iterations of Newton-Raphson method to get the result
- x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
- x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
- x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
-
- return vshl_s8(x, shift_value);
-}
-
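A worked example of the normalisation above, assuming Q2.5 (a scalar illustration using GCC/Clang's __builtin_clz, not library code): a = 1.5 has two leading zero bits, so shift_value = -(8 - (clz + fixed_point_position)) = -1. Shifting right once lands a in [0.5, 1) where the initial estimate is valid, and applying the same shift to the computed reciprocal restores the true scale, because if temp = a * 2^s then 1/a = (1/temp) * 2^s.

#include <cstdio>

int main()
{
    const int fixed_point_position = 5;
    const signed char a = 48; // 1.5 in Q2.5
    const int clz = __builtin_clz(static_cast<unsigned char>(a)) - 24; // clz over 8 bits
    const int shift_value = -(8 - (clz + fixed_point_position));
    std::printf("clz=%d shift=%d normalised=%d\n", clz, shift_value, a >> -shift_value);
    // prints clz=2 shift=-1 normalised=24 (0.75 in Q2.5)
}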
-inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
-{
- // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
- const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
- const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
- const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
- const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
-
- // Find shift value
- const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
- const qint16x4_t temp = vshl_s16(a, shift_value);
-
- // Newton-Raphson division initial estimate X0 calculation
- qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
-
- uint16x4_t set_one = vcgt_s16(x, const_one);
- x = vbsl_s16(set_one, const_one, x);
-
- // Use four iterations of Newton-Raphson method to get the result
- x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-
- return vshl_s16(x, shift_value);
-}
-
-inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
-{
- // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
- const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
- const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
- const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
- const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
-
- // Find shift value
- const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
- const qint8x8_t temp = vqshl_s8(a, shift_value);
-
- // Newton-Raphson division initial estimate X0 calculation
- qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));
-
- uint8x8_t set_one = vcgt_s8(x, const_one);
- x = vbsl_s8(set_one, const_one, x);
-
- // Use three iterations of Newton-Raphson method to get the result
- x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
-
- return vqshl_s8(x, shift_value);
-}
-
-inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
-{
- // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
- const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
- const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
- const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
- const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
-
- // Find shift value
- const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
- const qint16x4_t temp = vqshl_s16(a, shift_value);
-
- // Newton-Raphson division initial estimate X0 calculation
- qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
-
- uint16x4_t set_one = vcgt_s16(x, const_one);
- x = vbsl_s16(set_one, const_one, x);
-
- // Use four iterations of Newton-Raphson method to get the result
- x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-
- return vqshl_s16(x, shift_value);
-}
-
-inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
-{
- // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
- const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
- const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
- const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
- const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
-
- // Find shift value
- const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
- const qint8x16_t temp = vshlq_s8(a, shift_value);
-
- // Newton-Raphson division initial estimate X0 calculation
- qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));
-
- // Set initial guess to one if x > 1
- uint8x16_t set_one = vcgtq_s8(x, const_one);
- x = vbslq_s8(set_one, const_one, x);
-
- // Use three iterations of Newton-Raphson method to get the result
- x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
- x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
- x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
-
- return vshlq_s8(x, shift_value);
-}
-
-inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
-{
- // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
- const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
- const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
- const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
- const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
-
- // Find shift value
- const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
- const qint16x8_t temp = vshlq_s16(a, shift_value);
-
- // Newton-Raphson division initial estimate X0 calculation
- qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
-
- // Set initial guess to one if x > 1
- uint16x8_t set_one = vcgtq_s16(x, const_one);
- x = vbslq_s16(set_one, const_one, x);
-
- // Use four iterations of Newton-Raphson method to get the result
- x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-
- return vshlq_s16(x, shift_value);
-}
-
-inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
-{
- // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
- const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
- const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
- const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
- const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
-
- // Find shift value
- const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
- const qint8x16_t temp = vqshlq_s8(a, shift_value);
-
- // Newton-Raphson division initial estimate X0 calculation
- qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));
-
- // Set initial guess to one if x > 1
- uint8x16_t set_one = vcgtq_s8(x, const_one);
- x = vbslq_s8(set_one, const_one, x);
-
- // Use three iterations of Newton-Raphson method to get the result
- x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
-
- return vqshlq_s8(x, shift_value);
-}
-
-inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
-{
- // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
- const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
- const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
- const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
- const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
-
- // Find shift value
- const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
- const qint16x8_t temp = vqshlq_s16(a, shift_value);
-
- // Newton-Raphson division initial estimate X0 calculation
- qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
-
- // Set initial guess to one if x > 1
- uint16x8_t set_one = vcgtq_s16(x, const_one);
- x = vbslq_s16(set_one, const_one, x);
-
- // Use four iterations of Newton-Raphson method to get the result
- x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
- x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-
- // Saturate the final shift and map a == 0 to the maximum representable value
- return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
-}
-
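// A scalar sketch of the Newton-Raphson reciprocal implemented above, assuming
// a strictly positive input with fpp fractional bits; mul_fixed and recip_q are
// hypothetical helpers for illustration, not library API. The NEON versions do
// the same normalisation and iteration lane-wise, with shifts instead of loops.
#include <cstdint>

static inline int32_t mul_fixed(int32_t x, int32_t y, int fpp)
{
    return (int32_t)(((int64_t)x * y) >> fpp); // fixed-point multiply
}

static int16_t recip_q(int16_t a, int fpp)
{
    // Normalise a into t in [0.5, 1): a = t * 2^shift.
    int     shift = 0;
    int32_t t     = a;
    while (t >= (1 << fpp)) { t >>= 1; ++shift; }
    while (t < (1 << (fpp - 1))) { t <<= 1; --shift; }

    // Initial estimate x0 = 48/17 - 32/17 * t (the 0x5A../0x3C.. constants above).
    int32_t x = (48 << fpp) / 17 - mul_fixed((32 << fpp) / 17, t, fpp);

    // Newton-Raphson: x <- x * (2 - t * x). The error roughly squares on each
    // step, hence 3 iterations for 8-bit data and 4 for 16-bit.
    for (int i = 0; i < 4; ++i)
        x = mul_fixed(x, (2 << fpp) - mul_fixed(t, x, fpp), fpp);

    // 1/a = (1/t) * 2^-shift; the narrowing cast stands in for saturation.
    return (int16_t)(shift >= 0 ? x >> shift : x << -shift);
}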
-inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
-{
- return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
-}
-
-inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
-{
- return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
-}
-
-inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
-{
- return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
-}
-
-inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
-{
- return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
-}
-
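// Division then collapses to one multiply, exactly as vdiv_qs8/vdivq_qs16 do
// above (sketch, reusing the hypothetical mul_fixed/recip_q helpers):
static int16_t div_q(int16_t a, int16_t b, int fpp)
{
    return (int16_t)mul_fixed(a, recip_q(b, fpp), fpp); // a * (1/b)
}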
-template <bool islog>
-inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
-{
- const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
- const qint8x8_t const_one = vdup_n_s8(1);
- const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
- const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
- const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
- const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
- const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
- const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
- const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
- const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
- return res;
-}
-
-template <bool islog>
-inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
-{
- const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
- const qint16x4_t const_one = vdup_n_s16(1);
- const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
- const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
- const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
- const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
- const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
- const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
- const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
- const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
- return res;
-}
-
-template <bool islog>
-inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
-{
- const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
- const qint8x8_t const_one = vdup_n_s8(1);
- const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
- const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
- const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
- const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
- const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
- const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
- const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
- const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
- return res;
-}
-
-template <bool islog>
-inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
-{
- const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
- const qint16x4_t const_one = vdup_n_s16(1);
- const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
- const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
- const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
- const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
- const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
- const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
- const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
- const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
- return res;
-}
-
-template <bool islog>
-inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
-{
- const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
- const qint8x16_t const_one = vdupq_n_s8(1);
- const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
- const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
- const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
- const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
- const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
- const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
- const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
- const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
- return res;
-}
-
-template <bool islog>
-inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
-{
- const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
- const qint16x8_t const_one = vdupq_n_s16(1);
- const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
- const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
- const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
- const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
- const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
- const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
- const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
- const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
- return res;
-}
-
-template <bool islog>
-inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
-{
- const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
- const qint8x16_t const_one = vdupq_n_s8(1);
- const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
- const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
- const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
- const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
- const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
- const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
- const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
- const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
- return res;
-}
-
-template <bool islog>
-inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
-{
- const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
- const qint16x8_t const_one = vdupq_n_s16(1);
- const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
- const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
- const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
- const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
- const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
- const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
- const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
- const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
- return res;
-}
-
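// The vtaylor_poly_* templates above are a fixed-point Horner evaluation of a
// degree-4 polynomial with no constant term,
//     p(a) = A*a + B*a^2 + C*a^3 + D*a^4 = a*(A + a*(B + a*(C + a*D))),
// with A..D taken from exp_tab/log_tab and pre-shifted into the active Q
// format (the log variant doubles A once more). A scalar sketch; the
// coefficient array c[] = {A, B, C, D} is a hypothetical stand-in:
static int32_t horner_q(int32_t a, const int32_t c[4], int fpp)
{
    int32_t acc = c[3];                  // D
    acc = mul_fixed(a, acc, fpp) + c[2]; // a*D + C
    acc = mul_fixed(a, acc, fpp) + c[1]; // a*(a*D + C) + B
    acc = mul_fixed(a, acc, fpp) + c[0]; // a*(a*(a*D + C) + B) + A
    return mul_fixed(a, acc, fpp);       // final multiply by a
}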
-inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
-{
- const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
- const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
- const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
- const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)
-
- // Perform range reduction [-log(2),log(2)]
- const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
-
- // Get the integer part of m (the number of whole ln(2) multiples)
- const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
-
- qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
- alpha = vqabs_qs8(vqsub_s8(a, alpha));
-
- // Polynomial Approximation
- qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
- poly = vqadd_s8(poly, const_one);
-
- // Reconstruct
- poly = vqshl_s8(poly, dec_m);
-
- return poly;
-}
-
-inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
-{
- const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
- const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
- const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
- const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)
-
- // Perform range reduction [-log(2),log(2)]
- const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)
-
- // Get the integer part of m (the number of whole ln(2) multiples)
- const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));
-
- qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
- alpha = vqabs_qs16(vqsub_s16(a, alpha));
-
- // Polynomial Approximation
- qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
- poly = vqadd_s16(poly, const_one);
-
- // Reconstruct
- poly = vqshl_s16(poly, dec_m);
-
- return poly;
-}
-
-inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
-{
- const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
- const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
- const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
- const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)
-
- // Perform range reduction [-log(2),log(2)]
- const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
-
- // Get the integer part of m (the number of whole ln(2) multiples)
- const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
-
- qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
- alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));
-
- // Polynomial Approximation
- qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
- poly = vqaddq_s8(poly, const_one);
-
- // Reconstruct
- poly = vqshlq_s8(poly, dec_m);
-
- return poly;
-}
-
-inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
-{
- const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
- const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
- const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
- const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)
-
- // Perform range reduction [-log(2),log(2)]
- const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)
-
- // Get the integer part of m (the number of whole ln(2) multiples)
- const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));
-
- qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
- alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));
-
- // Polynomial Approximation
- qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
- poly = vqaddq_s16(poly, const_one);
-
- // Reconstruct
- poly = vqshlq_s16(poly, dec_m);
-
- return poly;
-}
-
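// The vqexp* family above implements exp(a) = 2^n * exp(r), where
// n = floor(a / ln 2) and r = a - n * ln 2, so the power of two becomes one
// final shift. A scalar sketch; exp_poly_q uses plain truncated-series
// coefficients here, whereas the removed code reads fitted values from
// exp_tab, so treat both helpers as illustrative only:
static int32_t exp_poly_q(int32_t r, int fpp) // ~ r + r^2/2 + r^3/6 + r^4/24
{
    const int32_t c[4] = { 1 << fpp, (1 << fpp) / 2, (1 << fpp) / 6, (1 << fpp) / 24 };
    return horner_q(r, c, fpp);
}

static int32_t exp_q(int32_t a, int fpp)
{
    const int32_t one     = 1 << fpp;
    const int32_t ln2     = (int32_t)(0.693147 * one);
    const int32_t inv_ln2 = (int32_t)(1.442695 * one);

    const int32_t m = mul_fixed(a, inv_ln2, fpp); // a / ln(2)
    const int32_t n = m >> fpp;                   // whole multiples of ln(2)

    int32_t r = a - mul_fixed(n << fpp, ln2, fpp); // remainder in [0, ln 2)
    if (r < 0) r = -r;                             // mirrors the vqabs above

    const int32_t p = exp_poly_q(r, fpp) + one;    // ~ exp(r)
    return n >= 0 ? p << n : p >> -n;              // reconstruct 2^n * exp(r)
}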
-inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
-{
- const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
- const qint8x8_t const_seven_dec = vdup_n_s8(7);
- const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
-
- // If 0 < a < 1, calculate log(1/a) and negate the result at the end
- uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
- qint8x8_t recip = vdup_n_s8(0);
- recip = vbsl_s8(calc_reciprocal, a, recip);
-
- // Calculate reciprocal
- recip = vrecip_qs8(recip, fixed_point_position);
- a = vbsl_s8(calc_reciprocal, recip, a);
-
- // Get decimal part of a
- qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
- qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
-
- // Get the exponent n of the largest 2^n that is less than or equal to dec_a
- shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
-
- // Get x to range (1, 2]
- const qint8x8_t shift_value_neg = vneg_s8(shift_value);
- const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
- const qint8x8_t sum = vmul_s8(shift_value, const_one);
-
- // Polynomial Approximation
- qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
-
- // Reconstruct
- poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
-
- // Set negative value for 0 < a < 1
- poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
-
- return poly;
-}
-
-inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
-{
- const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
- const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
- const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
-
- // If 0 < a < 1, calculate log(1/a) and negate the result at the end
- uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
- qint16x4_t recip = vdup_n_s16(0);
- recip = vbsl_s16(calc_reciprocal, a, recip);
-
- // Calculate reciprocal
- recip = vrecip_qs16(recip, fixed_point_position);
- a = vbsl_s16(calc_reciprocal, recip, a);
-
- // Get decimal part of a
- qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
- qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
-
- // Get the exponent n of the largest 2^n that is less than or equal to dec_a
- shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
-
- // Get x to range (1, 2]
- const qint16x4_t shift_value_neg = vneg_s16(shift_value);
- const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
- const qint16x4_t sum = vmul_s16(shift_value, const_one);
-
- // Polynomial Approximation
- qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
-
- // Reconstruct
- poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
-
- // Set negative value for 0 < a < 1
- poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
-
- return poly;
-}
-
-inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
-{
- const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
- const qint8x16_t const_seven_dec = vdupq_n_s8(7);
- const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
-
- // If 0 < a < 1, calculate log(1/a) and negate the result at the end
- uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
- qint8x16_t recip = vdupq_n_s8(0);
- recip = vbslq_s8(calc_reciprocal, a, recip);
-
- // Calculate reciprocal
- recip = vrecipq_qs8(recip, fixed_point_position);
- a = vbslq_s8(calc_reciprocal, recip, a);
-
- // Get decimal part of a
- qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
- qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position
-
- // Get the exponent n of the largest 2^n that is less than or equal to dec_a
- shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
-
- // Get x to range (1, 2]
- const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
- const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
- const qint8x16_t sum = vmulq_s8(shift_value, const_one);
-
- // Polynomial Approximation
- qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
-
- // Reconstruct
- poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
-
- // Set negative value for 0 < a < 1
- poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
-
- return poly;
-}
-
-inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
-{
- const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
- const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
- const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
-
- // If 0 < a < 1, calculate log(1/a) and negate the result at the end
- uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
- qint16x8_t recip = vdupq_n_s16(0);
- recip = vbslq_s16(calc_reciprocal, a, recip);
-
- // Calculate reciprocal
- recip = vqrecipq_qs16(recip, fixed_point_position);
- a = vbslq_s16(calc_reciprocal, recip, a);
-
- // Get decimal part of a
- qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
- qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position
-
- // Get the exponent n of the largest 2^n that is less than or equal to dec_a
- shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));
-
- // Get x to range (1, 2]
- const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
- const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
- const qint16x8_t sum = vmulq_s16(shift_value, const_one);
-
- // Polynomial Approximation
- qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);
-
- // Reconstruct
- poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);
-
- // Set negative value for 0 < a < 1
- poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);
-
- return poly;
-}
-
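// vlog* above reduces the problem to ln(a) = (log2(x) + n) * ln(2) with
// a = x * 2^n and x in (1, 2]; the polynomial approximates log2(x) around
// x = 1, and inputs below one go through the reciprocal since
// ln(1/a) = -ln(a). Scalar sketch with truncated-series coefficients standing
// in for the fitted log_tab values (hypothetical, as before):
static int32_t log2_poly_q(int32_t u, int fpp) // ~ log2(1 + u)
{
    const int32_t one  = 1 << fpp;
    const int32_t c[4] = { (int32_t)(1.4427 * one), (int32_t)(-0.7213 * one),
                           (int32_t)(0.4809 * one), (int32_t)(-0.3607 * one) };
    return horner_q(u, c, fpp);
}

static int32_t log_q(int32_t a, int fpp) // a > 0
{
    const int32_t one = 1 << fpp;
    const int32_t ln2 = (int32_t)(0.693147 * one);

    const bool flip = a < one; // 0 < a < 1: evaluate ln(1/a), negate at the end
    if (flip) a = recip_q((int16_t)a, fpp);

    int n = 0; // peel off powers of two until a = x * 2^n with x in (1, 2]
    while (a > (2 << fpp)) { a >>= 1; ++n; }

    const int32_t poly = log2_poly_q(a - one, fpp);    // ~ log2(x)
    const int32_t res  = mul_fixed(poly + (n << fpp), ln2, fpp);
    return flip ? -res : res;
}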
-inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
-{
- const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
-
- // Find shift value. Number must be in (0.5, 2) range.
- qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
-
- // Add one when the shift value is negative in order to get the correct result when we shift right by 1
- qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
- uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
- temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
- qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
-
- temp = vshl_s8(a, shift_value);
-
- // Initial guess
- qint8x8_t x = temp;
-
- // Calculate (x / 2) * (3 - a * x^2)
- // After three iterations we have the result for 8 bit
- x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
- return vshl_s8(x, shift_value2);
-}
-
-inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
-{
- const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
-
- // Find shift value. Number must be in (0.5, 2) range.
- qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
-
- // Add one when the shift value is negative in order to get the correct result when we shift right by 1
- qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
- uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
- temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
- qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));
-
- temp = vshl_s16(a, shift_value);
-
- // Initial guess
- qint16x4_t x = temp;
-
- // Calculate (x / 2) * (3 - a * x^2)
- // After five iterations we have the result for 16 bit
- x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
- return vshl_s16(x, shift_value2);
-}
-
-inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
-{
- const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
-
- // Find shift value. Number must be in (0.5, 2) range.
- qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
-
- // Add one when the shift value is negative in order to get the correct result when we shift right by 1
- qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
- uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
- temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
- qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));
-
- temp = vqshl_s8(a, shift_value);
-
- // Initial guess
- qint8x8_t x = temp;
-
- // Calculate (x / 2) * (3 - a * x^2)
- // After three iterations we have the result for 8 bit
- x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
- return vqshl_s8(x, shift_value2);
-}
-
-inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
-{
- const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
-
- // Find shift value. Number must be in (0.5, 2) range.
- qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
-
- // Add one when the shift value is negative in order to get the correct result when we shift right by 1
- qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
- uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
- temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
- qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));
-
- temp = vqshl_s16(a, shift_value);
-
- // Initial guess
- qint16x4_t x = temp;
-
- // Calculate (x / 2) * (3 - a * x^2)
- // After five iterations we have the result for 16 bit
- x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
- return vqshl_s16(x, shift_value2);
-}
-
-inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
-{
- const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
-
- // Find shift value. Number must be in (0.5, 2) range.
- qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
-
- // Add one when the shift value is negative in order to get the correct result when we shift right by 1
- qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
- uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
- temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
- qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
-
- temp = vshlq_s8(a, shift_value);
-
- // Initial guess
- qint8x16_t x = temp;
-
- // Calculate (x / 2) * (3 - a * x^2)
- // After three iterations we have the result for 8 bit
- x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
- return vshlq_s8(x, shift_value2);
-}
-
-inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
-{
- const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
-
- // Find shift value. Number must be in (0.5, 2) range.
- qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
-
- // Add one when the shift value is negative in order to get the correct result when we shift right by 1
- qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
- uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
- temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
- qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
-
- temp = vshlq_s16(a, shift_value);
-
- // Initial guess
- qint16x8_t x = temp;
-
- // Calculate (x / 2) * (3 - a * x^2)
- // After five iterations we have the result for 16 bit
- x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
- return vshlq_s16(x, shift_value2);
-}
-
-inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
-{
- const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
-
- // Find shift value. Number must be in (0.5, 2) range.
- qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
-
- // Add one when the shift value is negative in order to get the correct result when we shift right by 1
- qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
- uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
- temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
- qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));
-
- temp = vqshlq_s8(a, shift_value);
-
- // Initial guess
- qint8x16_t x = temp;
-
- // Calculate (x / 2) * (3 - a * x^2)
- // After three iterations we have the result for 8 bit
- x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
- return vqshlq_s8(x, shift_value2);
-}
-
-inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
-{
- const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
-
- // Find shift value. Number must be in (0.5, 2) range.
- qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
-
- // Add one when the shift value is negative in order to get the correct result when we shift right by 1
- qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
- uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
- temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
- qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));
-
- temp = vqshlq_s16(a, shift_value);
-
- // Initial guess
- qint16x8_t x = temp;
-
- // Calculate (x / 2) * (3 - a * x^2)
- // After five iterations we have the result for 16 bit
- x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
- x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
- return vqshlq_s16(x, shift_value2);
-}
-
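// The vinvsqrt* family above applies the Newton step x <- x * (3 - a*x^2) / 2
// for 1/sqrt(a), after scaling a by an even power of two so the final
// correction is an integer shift. A scalar sketch, assuming a > 0; the linear
// seed below is a simplification of the NEON code, which seeds with the
// normalised value itself and leans on saturating arithmetic:
static int32_t invsqrt_q(int32_t a, int fpp)
{
    const int32_t three = 3 << fpp;

    // a = t * 4^k with t in [0.5, 2)  =>  1/sqrt(a) = (1/sqrt(t)) * 2^-k.
    int k = 0;
    while (a >= (2 << fpp)) { a >>= 2; ++k; }
    while (a < (1 << (fpp - 1))) { a <<= 2; --k; }

    int32_t x = (three - a) >> 1; // seed (3 - t)/2, inside the basin of convergence
    for (int i = 0; i < 5; ++i)   // the code above uses 3 steps for 8 bit, 5 for 16 bit
        x = mul_fixed(x, three - mul_fixed(a, mul_fixed(x, x, fpp), fpp), fpp) >> 1;

    return k >= 0 ? x >> k : x << -k;
}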
-inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
-{
- const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
- const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
-
- const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
- const qint8x8_t num = vqsub_qs8(exp2x, const_one);
- const qint8x8_t den = vqadd_qs8(exp2x, const_one);
- const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
-
- return tanh;
-}
-
-inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
-{
- const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
- const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
-
- const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
- const qint16x4_t num = vqsub_qs16(exp2x, const_one);
- const qint16x4_t den = vqadd_qs16(exp2x, const_one);
- const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
-
- return tanh;
-}
-
-inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
-{
- const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
- const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
-
- const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
- const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
- const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
- const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
-
- return tanh;
-}
-
-inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
-{
- const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
- const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
-
- const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
- const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
- const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
- const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
-
- return tanh;
-}
-
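// vqtanh* above is the identity tanh(a) = (e^(2a) - 1) / (e^(2a) + 1), built
// entirely from the exp and reciprocal kernels. Scalar sketch with the same
// hypothetical helpers; the narrowing cast stands in for the saturation the
// NEON code gets for free:
static int32_t tanh_q(int32_t a, int fpp)
{
    const int32_t one = 1 << fpp;
    const int32_t e2a = exp_q(2 * a, fpp);
    return mul_fixed(e2a - one, recip_q((int16_t)(e2a + one), fpp), fpp);
}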
-inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
-{
- return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
-}
-
-inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
-{
- return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
-}
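// And vqpowq_* closes the loop with the usual a^b = exp(b * ln(a)) for a > 0
// (same hypothetical helpers as in the earlier sketches):
static int32_t pow_q(int32_t a, int32_t b, int fpp)
{
    return exp_q(mul_fixed(b, log_q(a, fpp), fpp), fpp);
}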
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
index 06a0a01782..0290e32085 100644
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -24,7 +24,6 @@
#ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
#define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/QAsymm8.h"
@@ -59,7 +58,7 @@ public:
* @note If the output tensor is a nullptr, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] activation_info Activation layer information.
*/
@@ -67,7 +66,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * of the activation function. Data types supported: QASYMM8/F16/F32.
* @param[in] output Destination tensor info. Data type supported: same as @p input
* @param[in] act_info Activation layer information.
*
@@ -104,19 +103,7 @@ private:
* @param[in] window Region on which to execute the kernel
*/
template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type activation(const Window &window);
private:
ITensor *_input;
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
index 155e792f5d..8cf21eae9d 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -57,26 +57,24 @@ public:
* Valid configurations (Input1,Input2) -> Output :
*
* - (U8,U8) -> U8
- * - (QS8,QS8) -> QS8
* - (U8,U8) -> S16
* - (S16,U8) -> S16
* - (U8,S16) -> S16
* - (S16,S16) -> S16
- * - (QS16,QS16) -> QS16
* - (F16,F16) -> F16
* - (F32,F32) -> F32
*
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32.
* @param[in] policy Overflow policy.
*/
void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel
*
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[in] output The output tensor. Data types supported: U8/S16/F16/F32.
* @param[in] policy Overflow policy.
*
* @return a status
@@ -90,9 +88,9 @@ public:
private:
/** Common signature for all the specialised add functions
*
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32.
* @param[in] window Region on which to execute the kernel.
*/
using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
index 73ecfcfeb5..3e93922b65 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -57,26 +57,24 @@ public:
* Valid configurations (Input1,Input2) -> Output :
*
* - (U8,U8) -> U8
- * - (QS8,QS8) -> QS8
* - (U8,U8) -> S16
* - (S16,U8) -> S16
* - (U8,S16) -> S16
* - (S16,S16) -> S16
- * - (QS16,QS16) -> QS16
* - (F16,F16) -> F16
* - (F32,F32) -> F32
*
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32.
* @param[in] policy Overflow policy.
*/
void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel
*
- * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32
+ * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32
+ * @param[in] output Output tensor. Data types supported: U8/S16/F16/F32
* @param[in] policy Policy to use to handle overflow.
*
* @return a status
@@ -89,9 +87,9 @@ public:
private:
/** Common signature for all the specialised sub functions
*
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32.
* @param[in] window Region on which to execute the kernel.
*/
using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 2d33f87dfa..2a540c151b 100644
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -57,7 +57,7 @@ public:
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * The rest are optional and used for representing batches. Data types supported: F16/F32.
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
@@ -72,7 +72,7 @@ public:
*
* @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * The rest are optional and used for representing batches. Data types supported: F16/F32.
* @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
@@ -96,22 +96,7 @@ private:
void configure_non_fused();
/** Configure execution function in case of fused activation **/
void configure_fused();
- /** Template function to run batch normalization on 8-bit fixed point
- *
- * @tparam fused_activation Boolean that flags if its a fused activation or not
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool fused_activation>
- void batch_normalization_qs8(const Window &window);
- /** Template function to run batch normalization on 16-bit fixed point
- *
- * @tparam fused_activation Boolean that flags if its a fused activation or not
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool fused_activation>
- void batch_normalization_qs16(const Window &window);
+
/** Template function to run batch normalization on fp16
*
* @tparam fused_activation Boolean that flags if its a fused activation or not
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
index 9fb493cc4f..f02858e7d9 100644
--- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -72,7 +72,7 @@ public:
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
* while the rest represent batch of outputs. Data types supported: Same as @p input
* @param[in] convolved_dims Output convolved dimensions.
@@ -80,7 +80,7 @@ public:
void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims);
/** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel
*
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
* while the rest represent batch of outputs. Data types supported: Same as @p input
* @param[in] convolved_dims Output convolved dimensions.
diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
index 65ce764246..d5c9e3bbe9 100644
--- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
+++ b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
@@ -59,7 +59,7 @@ public:
~NEConvertFullyConnectedWeightsKernel() = default;
/** Set the input and output tensor.
*
- * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32.
+ * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32.
* @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format.
* @param[in] data_layout The data layout the weights have been trained in.
@@ -67,7 +67,7 @@ public:
void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeightsKernel
*
- * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32.
+ * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32.
* @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format.
* @param[in] data_layout The data layout the weights have been trained in.
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
index 67ef5293b7..12a5051ef8 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
@@ -55,7 +55,7 @@ public:
~NEDepthConcatenateLayerKernel() = default;
/** Initialise the kernel's inputs and output
*
- * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] input Input tensor. Data types supported: F16/F32.
* @param[in] depth_offset The offset on the Z axis.
* @param[in,out] output Output tensor. Data types supported: Same as @p input.
*
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
index 50536f2b47..77bb0413ca 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
@@ -55,19 +55,12 @@ public:
*
* Valid conversions Input -> Output :
*
- * - QS8 -> QS8, F32
* - U8 -> U16, S16, S32
* - U16 -> U8, U32
* - S16 -> U8, S32
- * - QS16 -> QS16, F32
- * - F32 -> QS8
*
- * @warning In case of in-place fixed point position conversion make sure that configure has been called
- * before the updated tensor is used in other functions, as the TensorInfo of the tensor will be
- * altered. In-place is only supported for QS8 -> QS8, QS16 -> QS16.
- *
- * @param[in, out] input The input tensor to convert (Written in case of in-place computation). Data types supported: U8/QS8/U16/S16/F32.
- * @param[out] output The output tensor. Can be null in case of in-place computation. Data types supported: U8/QS8/U16/S16/U32/S32/F32.
+ * @param[in, out] input The input tensor to convert (Written in case of in-place computation). Data types supported: U8/U16/S16.
+ * @param[out] output The output tensor. Can be null in case of in-place computation. Data types supported: U8/U16/S16/U32/S32/F32.
* @param[in] policy Conversion policy.
* @param[in] shift (Optional) Value for down/up conversions. Must be 0 <= shift < 8.
* In case of fixed point position conversion, it specifies the new fixed point position, if operation is in-place.
@@ -82,8 +75,6 @@ private:
ITensor *_output;
ConvertPolicy _policy;
uint32_t _shift;
- int _fixed_point_position_input;
- int _fixed_point_position_output;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */
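[Editor's note] For reference, a hedged sketch of one remaining conversion driven through NEDepthConvertLayer, the runtime wrapper around this kernel (shapes illustrative):

// Hypothetical example: widen U8 to U16 with a saturating up-shift of 2 bits.
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/runtime/Tensor.h"

void convert_example()
{
    using namespace arm_compute;
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U16));

    NEDepthConvertLayer convert;
    convert.configure(&src, &dst, ConvertPolicy::SATURATE, 2 /* shift */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    convert.run();
}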
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index f859f97dae..589725ab01 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -57,24 +57,24 @@ public:
* 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3
*
* @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported: Same as @p input.
* @param[out] output Output tensor.
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32
+ * The 3rd dimension must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/
void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
/** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel
*
* @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported: Same as @p input.
* @param[in] output Output tensor.
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32
+ * The 3rd dimension must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS32/F16/F32
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*
* @return a status
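[Editor's note] A hedged configure sketch of the remaining F32 path (shapes and tensor names illustrative; tensors must be initialized and allocated elsewhere):

// Hypothetical F32 setup: 3x3 kernel, stride 1, pad 1.
void direct_conv_example(const arm_compute::ITensor *input, const arm_compute::ITensor *weights,
                         arm_compute::ITensor *output)
{
    arm_compute::NEDirectConvolutionLayerKernel conv_kernel;
    conv_kernel.configure(input, weights, output, arm_compute::PadStrideInfo(1, 1, 1, 1));
}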
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
index 77711d7ecd..7fd1d70374 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
@@ -55,10 +55,10 @@ public:
/** Set the accumulate buffer and the biases of the kernel.
*
* @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: QS16/QS32/F16/F32
+ * Data type supported: QS32/F16/F32
* @param[in]  bias                          (Optional) The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p input
* @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Data type supported: QS8/QS16/F16/F32
+ * Data type supported: F16/F32
* @param[in]  result_fixedpoint_multiplier  (Optional) Fixed point value to be multiplied to each element of the input matrix once the result_offset has been added
* @param[in]  result_shift                  (Optional) Integer value used to round the result of the fixed point multiplication to the nearest division by a power of two
* @param[in]  result_offset_after_shift     (Optional) Offset to be applied to the result before converting it back to QASYMM8
@@ -68,10 +68,10 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel
*
* @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: QS16/QS32/F16/F32
+ * Data type supported: QS32/F16/F32
* @param[in] bias                          (Optional) The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p input
* @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Data type supported: QS8/QS16/F16/F32
+ * Data type supported: F16/F32
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr);
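[Editor's note] The three optional parameters describe the usual integer requantization of a convolution accumulator. A simplified scalar model of the per-element arithmetic (illustrative only; the kernel itself uses saturating, rounding NEON intrinsics rather than plain shifts):

// Scalar model of the QASYMM8 output stage (rounding simplified).
#include <algorithm>
#include <cstdint>

uint8_t output_stage(int32_t acc, int32_t bias,
                     int32_t result_fixedpoint_multiplier,
                     int32_t result_shift,
                     int32_t result_offset_after_shift)
{
    int64_t v = static_cast<int64_t>(acc) + bias;        // add the shared bias
    v = (v * result_fixedpoint_multiplier) >> 31;        // fixed point multiply (Q0.31 multiplier)
    v >>= result_shift;                                  // division by a power of two
    v += result_offset_after_shift;                      // re-centre on the QASYMM8 offset
    return static_cast<uint8_t>(std::min<int64_t>(255, std::max<int64_t>(0, v))); // saturate
}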
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
index dd19b8f35a..cff6b4ea2d 100644
--- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -57,7 +57,7 @@ public:
*
* @note This kernel fills the borders within the XY-planes.
*
- * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QS8/QASYMM8/QS16/S16/S32/F32.
+ * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QASYMM8/S16/S32/F32.
* @param[in] border_size Size of the border to fill in elements.
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
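[Editor's note] A minimal configure sketch (border size illustrative; kernels are normally scheduled through NEScheduler after configuration):

// Hypothetical: pad one element on each side; CONSTANT mode defaults to a zero PixelValue.
void fill_border_example(arm_compute::ITensor *tensor)
{
    arm_compute::NEFillBorderKernel fill_border;
    fill_border.configure(tensor, arm_compute::BorderSize(1), arm_compute::BorderMode::CONSTANT);
}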
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
index 545a265dc2..2b6c7af72a 100644
--- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
@@ -57,7 +57,7 @@ public:
*
* @note This kernel fills the borders within the XY-planes.
*
- * @param[in,out] input Tensor to process. Data types supported: U8/QS8/S16/S32/F32.
+ * @param[in,out] input Tensor to process. Data types supported: U8/S16/S32/F32.
* @param[in] border_size Size of the border to fill in elements.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 79504fd4da..5c0104d138 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -60,13 +60,13 @@ public:
NEGEMMInterleave4x4Kernel();
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
*
* @return a status
@@ -79,7 +79,7 @@ public:
private:
/** Common signature for all the transpose functions
*
- * @param[in] input An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input An input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output The output tensor. Data type supported: same as @p input
* @param[in] window Region on which to execute the kernel.
*/
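[Editor's note] For orientation, the 4x4 interleave walks four rows at a time and emits them column by column, so rows a, b, c, d become a0 b0 c0 d0 a1 b1 c1 d1 ... A scalar reference of the idea (illustrative; assumes the row count is a multiple of 4):

// Scalar reference of the 4x4 interleave.
void interleave_4x4(const float *in, float *out, int rows, int cols)
{
    for (int r = 0; r < rows; r += 4)
        for (int c = 0; c < cols; ++c)
            for (int k = 0; k < 4; ++k)              // walk the 4 grouped rows
                *out++ = in[(r + k) * cols + c];     // emit column by column
}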
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
index e48a9a77e4..419a9f9150 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -51,13 +51,13 @@ public:
~NEGEMMMatrixAccumulateBiasesKernel() = default;
/** Set the accumulate buffer and the biases of the kernel.
*
- * @param[in, out] accum The accumulate tensor to convert. Data type supported: QS8/QS16/F32
+ * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32
* @param[in]      biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
*/
void configure(ITensor *accum, const ITensor *biases);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel
*
- * @param[in] accum The accumulate tensor to convert. Data type supported: QS8/QS16/F32
+ * @param[in] accum The accumulate tensor to convert. Data type supported: F32
* @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
*
* @return a status
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
index 5e4f8b72ff..1a235933dc 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -59,7 +59,7 @@ public:
*
* @note The input and output tensor must have the same dimensions
*
- * @param[in] input Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32
* @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
* @param[in] beta Weight of matrix C
*/
@@ -71,7 +71,7 @@ public:
private:
/** Common signature for all the matrix addition functions
*
- * @param[in] input An input tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input An input tensor. Data types supported: F16/F32
* @param[out] output The output tensor. Data type supported: same as @p input
* @param[in] window Region on which to execute the kernel.
* @param[in] beta Weight of matrix C
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
index d54522c678..6ee958205e 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -58,7 +58,7 @@ public:
* @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
* These two kernels change the layout of the original matrices to be more cache-friendly.
*
- * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
* @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
* If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
@@ -69,7 +69,7 @@ public:
void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel
*
- * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
* @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
* If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
* @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
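[Editor's note] As the note above says, for matrix output the multiply kernel consumes reshaped operands. A hedged sketch of the kernel ordering for output = alpha * A * B + beta * C (the NEGEMM function normally orchestrates this and computes a proper GEMMReshapeInfo, which is left defaulted here; all tensor arguments are assumed initialized elsewhere):

void gemm_kernels_example(const arm_compute::ITensor *a, const arm_compute::ITensor *b,
                          const arm_compute::ITensor *c,
                          arm_compute::ITensor *a_reshaped, arm_compute::ITensor *b_reshaped,
                          arm_compute::ITensor *output, float alpha, float beta)
{
    using namespace arm_compute;
    NEGEMMInterleave4x4Kernel  interleave_a;   // A -> interleaved A'
    NEGEMMTranspose1xWKernel   transpose_b;    // B -> 1xW-transposed B'
    NEGEMMMatrixMultiplyKernel matrix_mul;     // output = alpha * (A' x B')
    NEGEMMMatrixAdditionKernel matrix_add;     // output += beta * C

    interleave_a.configure(a, a_reshaped);
    transpose_b.configure(b, b_reshaped);
    matrix_mul.configure(a_reshaped, b_reshaped, output, alpha, true /* is_interleaved */);
    matrix_add.configure(c, output, beta);
}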
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index fcdd8dd93c..b7fbfcfcd2 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -74,13 +74,13 @@ public:
}
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input.
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output Output tensor info. Data type supported: same as @p input.
*
* @return a status
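[Editor's note] The 1xW transpose is the companion reshape for the right-hand GEMM operand: with F32 the block width W is 4 (a 16-byte transposition block of 4-byte elements), and each 1x4 run of a row is moved as a unit. A scalar model under those assumptions (illustrative; assumes cols is a multiple of 4):

// Illustrative 1x4-block transpose for F32 (W = 4).
void transpose_1x4(const float *in, float *out, int rows, int cols)
{
    for (int c = 0; c < cols; c += 4)
        for (int r = 0; r < rows; ++r)
            for (int k = 0; k < 4; ++k)
                *out++ = in[r * cols + c + k];   // each 1x4 block stays contiguous
}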
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index 5aa803f4fd..d455fd98b3 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -77,7 +77,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32
* Note: QASYMM8 works only for has_bias = false
* @param[out] output The output tensor. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
@@ -92,7 +92,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32
* Note: QASYMM8 works only for has_bias = false
* @param[in] output The output tensor. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
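[Editor's note] Concretely, im2col linearizes each convolution window into one column of the output, appending a constant 1 when has_bias is true so the bias can ride along in the following GEMM; that appended 1 is also why QASYMM8 excludes the bias path. A simplified scalar model (single channel, stride 1, no padding, all illustrative):

// Illustrative scalar im2col for one channel, stride 1, no padding.
void im2col(const float *in, float *out, int w, int h, int kw, int kh, bool has_bias)
{
    for (int y = 0; y <= h - kh; ++y)
        for (int x = 0; x <= w - kw; ++x)
        {
            for (int ky = 0; ky < kh; ++ky)             // copy one kw x kh patch...
                for (int kx = 0; kx < kw; ++kx)
                    *out++ = in[(y + ky) * w + (x + kx)];
            if (has_bias)
                *out++ = 1.0f;                          // ...then append the bias multiplier
        }
}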
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index 6ae7b73423..92086437a6 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -54,7 +54,7 @@ public:
/** Set the input and output tensors.
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32.
+ * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32.
* @param[in]  input_squared Source tensor in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
* Data type supported: same as @p input
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
@@ -64,7 +64,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32.
+ * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32.
* @param[in] input_squared Source tensor in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
* Data type supported: same as @p input
* @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
@@ -92,18 +92,6 @@ private:
template <DataType dt, unsigned int dim, bool do_2D_norm>
void normalize_float(const Window &window);
- /** Function to perform normalization for fixed-point values depending on
- * the given template dimension. The second template parameter specifies
- * whether the normalization has to be 1D or 2D.
- *
- * @note Only supported normalizations are:
- * - 1D over X or Z
- * - 2D over X and Y
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <DataType dt, unsigned int dim, bool do_2D_norm>
- void normalize_fixed_point(const Window &window);
/** Common signature for all the specialised normalization functions
*
* @param[in] window Region on which to execute the kernel.
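[Editor's note] The separate @p input_squared tensor lets the kernel reuse precomputed squares across overlapping windows: cross-map normalization scales each element by (kappa + alpha/n * sum of squares over the window)^-beta. A scalar sketch of the 1D cross-channel case (parameter names mirror NormalizationLayerInfo; all illustrative):

#include <algorithm>
#include <cmath>

// Illustrative 1D cross-map normalization of channel c at one (x, y) position.
float normalize_at(const float *in, const float *in_sq, int channels,
                   int c, int norm_size, float kappa, float alpha, float beta)
{
    const int half = norm_size / 2;
    float sum = 0.f;
    for (int i = std::max(0, c - half); i <= std::min(channels - 1, c + half); ++i)
        sum += in_sq[i];                                  // reuse the precomputed squares
    return in[c] * std::pow(kappa + (alpha / norm_size) * sum, -beta);
}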
diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/arm_compute/core/NEON/kernels/NEPermuteKernel.h
index 68bbdcb3cb..b56faa8514 100644
--- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPermuteKernel.h
@@ -58,7 +58,7 @@ public:
*
* @note Supported permutation vectors : [2, 0, 1], [1, 2, 0]
*
- * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output The output tensor. Data types supported: Same as @p input
* @param[in] perm Permutation vector
*/
@@ -67,7 +67,7 @@ public:
*
* @note Supported permutation vectors : [2, 0, 1], [1, 2, 0]
*
- * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output The output tensor. Data types supported: Same as @p input
* @param[in] perm Permutation vector
*
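[Editor's note] The two supported vectors move the channel dimension between the front and the back of the shape, i.e. between NCHW- and NHWC-style orderings. A minimal sketch of the shape arithmetic, under the assumed convention out[perm[i]] = in[i] (check arm_compute::permute for the exact rule):

#include <array>
#include <cstdio>

int main()
{
    std::array<int, 3> shape = {32, 24, 16};   // e.g. [W, H, C]
    std::array<int, 3> perm  = {2, 0, 1};      // one of the two supported vectors
    std::array<int, 3> out{};
    for (int i = 0; i < 3; ++i)
        out[perm[i]] = shape[i];               // move each dimension to its new slot
    std::printf("[%d, %d, %d]\n", out[0], out[1], out[2]);  // -> [24, 16, 32]
}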
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 8c245569a5..41ea91495f 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -55,11 +55,10 @@ public:
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- * For QS8/QS16 scale = 1 is the only supported value.
*
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
- * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[in]  input2          An input tensor. Data types supported: U8/S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[out] output          The output tensor. Data types supported: U8 (only if both inputs are U8), S16, F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy.
@@ -70,11 +69,10 @@ public:
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- * For QS8/QS16 scale = 1 is the only supported value.
*
- * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
- * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * @param[in] input2          An input tensor. Data types supported: U8/S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[in] output          The output tensor. Data types supported: U8 (only if both inputs are U8), S16, F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy.
@@ -96,15 +94,6 @@ private:
* @param[out] output_ptr Pointer to the output tensor.
*/
using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
- /** Common signature for all the specialised multiplication functions with fixed-point values
- *
- * @param[in] input1_ptr Pointer to the first input tensor.
- * @param[in] input2_ptr Pointer to the second input tensor.
- * @param[in] scale Scaling factor.
- * @param[in] fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number.
- * @param[out] output_ptr Pointer to the output tensor.
- */
- using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position);
/** Common signature for all the specialised multiplication functions with float scaling factor
*
* @param[in] input1_ptr Pointer to the first input tensor.
@@ -115,7 +104,6 @@ private:
MulFunctionFloat *_func_float;
MulFunctionInt *_func_int;
- MulFunctionQInt *_func_q_int;
private:
const ITensor *_input1;
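[Editor's note] The 1/2^n restriction on @p scale exists because the integer path applies scaling as a right shift after widening, and 1/255 is special-cased with its own rounding rule. A scalar model of the U8 path (illustrative; the kernel itself is vectorized and policy-driven):

#include <algorithm>
#include <cstdint>

// Illustrative U8 x U8 with scale = 1 / 2^n and the SATURATE overflow policy.
uint8_t mul_u8(uint8_t a, uint8_t b, int n)
{
    int32_t product = int32_t(a) * int32_t(b);   // widen before multiplying
    int32_t scaled  = product >> n;              // scale as a power-of-two shift
    return uint8_t(std::min(255, scaled));       // saturate to the U8 range
}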
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 4140ccf1ed..6c4c1db289 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -52,18 +52,18 @@ public:
~NEPoolingLayerKernel() = default;
/** Set the input and output tensors.
*
- * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
+ * @note F16 is supported for pool sizes 2 and 3 only
*
- * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
*
- * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
+ * @note F16 is supported for pool sizes 2 and 3 only
*
- * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
* @param[in] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*
@@ -90,13 +90,6 @@ private:
*/
template <PoolingType pooling_type, bool exclude_padding = false>
void pooling2_f16_nchw(const Window &window_input, const Window &window);
- /** Function to perform 2x2 pooling for 8bit fixed point.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- */
- template <PoolingType pooling_type>
- void pooling2_q8_nchw(const Window &window_input, const Window &window);
/** Function to perform 2x2 pooling for 8-bit quantized asymmetric values.
*
* @param[in] window_input Input region on which to execute the kernel.
@@ -104,13 +97,6 @@ private:
*/
template <PoolingType pooling_type, bool exclude_padding = false>
void pooling2_qasymm8_nchw(const Window &window_input, const Window &window);
- /** Function to perform 2x2 pooling for 16bit fixed point.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- */
- template <PoolingType pooling_type>
- void pooling2_q16_nchw(const Window &window_input, const Window &window);
/** Function to perform 3x3 pooling.
*
* @param[in] window_input Input region on which to execute the kernel.
@@ -125,13 +111,6 @@ private:
*/
template <PoolingType pooling_type, bool exclude_padding = false>
void pooling3_f16_nchw(const Window &window_input, const Window &window);
- /** Function to perform 3x3 pooling for 8bit fixed point.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- */
- template <PoolingType pooling_type>
- void pooling3_q8_nchw(const Window &window_input, const Window &window);
/** Function to perform 3x3 pooling for 8-bit quantized asymmetric values.
*
* @param[in] window_input Input region on which to execute the kernel.
@@ -139,13 +118,6 @@ private:
*/
template <PoolingType pooling_type, bool exclude_padding = false>
void pooling3_qasymm8_nchw(const Window &window_input, const Window &window);
- /** Function to perform 3x3 pooling for 16bit fixed point.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- */
- template <PoolingType pooling_type>
- void pooling3_q16_nchw(const Window &window_input, const Window &window);
/** Function to perform 7x7 pooling.
*
* @param[in] window_input Input region on which to execute the kernel.
@@ -153,13 +125,6 @@ private:
*/
template <PoolingType pooling_type, bool exclude_padding = false>
void pooling7_f32_nchw(const Window &window_input, const Window &window);
- /** Function to perform MxN pooling for 8bit fixed point.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- */
- template <PoolingType pooling_type>
- void poolingMxN_q8_nchw(const Window &window_input, const Window &window);
/** Function to perform MxN pooling for 8-bit quantized values.
*
* @param[in] window_input Input region on which to execute the kernel.
@@ -174,13 +139,6 @@ private:
*/
template <PoolingType pooling_type, bool exclude_padding = false>
void poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window);
- /** Function to perform MxN pooling for 16bit fixed point.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- */
- template <PoolingType pooling_type>
- void poolingMxN_q16_nchw(const Window &window_input, const Window &window);
/** Function to perform MxN pooling for 16-bit floating point values.
*
* @param[in] window_input Input region on which to execute the kernel.
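[Editor's note] A minimal configure sketch for the remaining data types (pool size, strides and tensors are illustrative):

// Hypothetical 2x2 max pooling with stride 2 on an F32 tensor.
void pooling_example(const arm_compute::ITensor *input, arm_compute::ITensor *output)
{
    arm_compute::NEPoolingLayerKernel pool;
    pool.configure(input, output,
                   arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX, 2,
                                                 arm_compute::PadStrideInfo(2, 2, 0, 0)));
}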
diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
index 0a3fc44881..08b4e11189 100644
--- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
@@ -40,7 +40,7 @@ public:
}
/** Set the input and output of the kernel
*
- * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/QASYMM8/U32/S32/F16/F32
+ * @param[in] input Source tensor. Data type supported: U8/S8/U16/S16/QASYMM8/U32/S32/F16/F32
* @param[out] output Destination tensor. Data type supported: Same as @p input
*/
void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
index c30a4cd23d..25c3196e34 100644
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -43,13 +43,13 @@ public:
NELogits1DMaxKernel();
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
* @param[out] output Destination tensor. Data types supported: same as @p input
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
* @param[in] output Destination tensor. Data types supported: same as @p input
*
* @return a status
@@ -90,7 +90,7 @@ public:
~NELogits1DSoftmaxKernel() = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
* @param[in] max Max values tensor. Same shape as input with dimension 0 set to 1.
* Data types supported: same as @p input.
* @param[out] output Destination tensor. Data types supported: same as @p input.
@@ -101,7 +101,7 @@ public:
void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp);
/** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel
*
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
* @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1.
* Data types supported: same as @p input.
* @param[in] output Destination tensor info. Data types supported: same as @p input.
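[Editor's note] The @p max tensor is the usual numerical-stability device: softmax is invariant to subtracting a per-row constant, so the row maximum is removed before exponentiation to keep exp() from overflowing. A scalar model of the split between the two kernels (illustrative):

#include <algorithm>
#include <cmath>
#include <vector>

// NELogits1DMaxKernel finds m; NELogits1DSoftmaxKernel does the rest.
std::vector<float> softmax(const std::vector<float> &x, float beta)
{
    const float m = *std::max_element(x.begin(), x.end());  // max kernel
    std::vector<float> y(x.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(beta * (x[i] - m));                 // shifted exponent
        sum += y[i];
    }
    for (float &v : y)
        v /= sum;                                           // normalize
    return y;
}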
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
index dc7ef8ff7a..76823acfa1 100644
--- a/arm_compute/core/NEON/kernels/NETransposeKernel.h
+++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h
@@ -57,13 +57,13 @@ public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] output Output tensor. Data type supported: Same as @p input
*
* @return a status
@@ -76,7 +76,7 @@ public:
private:
/** Common signature for all the transpose functions
*
- * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input An input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
* @param[out] output The output tensor. Data type supported: same as @p input
* @param[in] window Region on which to execute the kernel.
*/
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
index 1a7525bfc7..21f36f6c2b 100644
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -75,7 +75,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F32
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F32
* @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
* dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
* @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
@@ -85,7 +85,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
*
* @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32
* @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
* dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
* @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
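[Editor's note] Concretely, each [kernel_x, kernel_y, IFM] filter is flattened into one column of a 2D matrix with OFM columns, and the bias, when present, is appended as a final row so the im2col'd patches (which end in 1) pick it up during the GEMM. A scalar model (shared weights, row-major, all illustrative):

// Illustrative weights reshape: [kx, ky, ifm, ofm] -> [kx*ky*ifm (+1)] x [ofm].
void reshape_weights(const float *w, const float *bias, float *out,
                     int kx, int ky, int ifm, int ofm)
{
    const int len = kx * ky * ifm;
    for (int o = 0; o < ofm; ++o)
    {
        for (int i = 0; i < len; ++i)
            out[i * ofm + o] = w[o * len + i];  // one flattened filter per column
        if (bias != nullptr)
            out[len * ofm + o] = bias[o];       // bias appended as the last row
    }
}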
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
index fee206638b..fd0c0f0c34 100644
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -45,13 +45,11 @@ inline float32x4x3_t load_matrix_row(const float *ptr)
}
template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
-
const float32x4x3_t vtop =
{
{
@@ -108,9 +106,9 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
{
- float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
@@ -118,9 +116,9 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
{
- float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
return out;
}
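[Editor's note] A note on the stride specializations above: they reuse the stride-1 result and then compact it. With stride 2 the valid outputs sit in lanes 0 and 2 of each float32x4, so the vgetq_lane/vsetq_lane shuffles pack lanes {0, 2, 4, 6} of the eight computed values into the first vector. A scalar equivalent (illustrative):

// Scalar equivalent of the stride-2 lane compaction on 8 stride-1 results.
void compact_stride2(const float in[8], float out[4])
{
    out[0] = in[0];  // lane 0 of val[0]
    out[1] = in[2];  // lane 2 of val[0]
    out[2] = in[4];  // lane 0 of val[1]
    out[3] = in[6];  // lane 2 of val[1]
}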
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index 908fa13876..d56fd44700 100644
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -55,29 +55,6 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
return r;
}
-/** Loads a 3x3 matrix as a row (qint8_t).
- *
- * @param[in] ptr Pointer to a qint8 3x3 matrix.
- * @param[in] weights_offset (Optional) Weights quantization offset.
- *
- * @return The loaded matrix.
- */
-inline qint8x8x3_t load_matrix_row(const qint8_t *ptr, int weights_offset = 0)
-{
- ARM_COMPUTE_UNUSED(weights_offset);
- /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
- r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- const qint8x8x3_t r =
- {
- {
- vld1_dup_qs8(ptr),
- vld1_dup_qs8(1 + ptr),
- vld1_dup_qs8(2 + ptr)
- }
- };
- return r;
-}
-
/** Loads a 3x3 matrix as a row (uint8_t).
*
* @param[in] ptr Pointer to a uint8_t 3x3 matrix.
@@ -104,27 +81,25 @@ inline int32x4x3_t load_matrix_row(const uint8_t *ptr, int weights_offset = 0)
/** Perform a convolve3x3 on float32.
*
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] fixed_point_position (Optional) Fixed point position.
- * @param[in] input_offset (Optional) Input quantization offset.
+ * @param[in] in_top Pointer to the first row of the input.
+ * @param[in] in_mid Pointer to the second row of the input.
+ * @param[in] in_low Pointer to the third row of the input.
+ * @param[in] m0 First row of the filter.
+ * @param[in] m1 Second row of the filter.
+ * @param[in] m2 Third row of the filter.
+ * @param[in] input_offset (Optional) Input quantization offset.
*
*/
template <unsigned int stridex>
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low,
const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- int fixed_point_position, int input_offset = 0);
+ int input_offset = 0);
template <>
inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low,
const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- int fixed_point_position, int input_offset)
+ int input_offset)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
ARM_COMPUTE_UNUSED(input_offset);
const float32x4x3_t vtop =
@@ -185,11 +160,11 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c
template <>
inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low,
const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- int fixed_point_position, int input_offset)
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
- float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
@@ -199,145 +174,35 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c
template <>
inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low,
const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- int fixed_point_position, int input_offset)
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
- float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
return out;
}
-/** Perform a convolve3x3 on qint16.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] fixed_point_position (Optional) Fixed point position.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- */
-template <unsigned int stridex>
-qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low,
- const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2,
- int fixed_point_position, int input_offset = 0);
-
-template <>
-inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low,
- const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2,
- int fixed_point_position, int input_offset)
-{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- ARM_COMPUTE_UNUSED(input_offset);
-
- const qint8x8x3_t vtop =
- {
- {
- vld1_qs8(in_top),
- vld1_qs8(in_top + 8),
- vld1_qs8(in_top + 16)
- }
- };
- const qint8x8x3_t vmid =
- {
- {
- vld1_qs8(in_mid),
- vld1_qs8(in_mid + 8),
- vld1_qs8(in_mid + 16)
- }
- };
- const qint8x8x3_t vlow =
- {
- {
- vld1_qs8(in_low),
- vld1_qs8(in_low + 8),
- vld1_qs8(in_low + 16)
- }
- };
- qint16x8x2_t out =
- {
- {
- vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
- vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
- }
- };
- out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
- out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
- out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
- out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
- out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
- out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
- out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
- out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
- out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
- out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
- out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
- out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
- out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
- out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
- out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
- out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
- return out;
-}
-
-template <>
-inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low,
- const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2,
- int fixed_point_position, int input_offset)
-{
- ARM_COMPUTE_UNUSED(input_offset);
-
- qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
- return out;
-}
-
-template <>
-inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low,
- const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2,
- int fixed_point_position, int input_offset)
-{
- ARM_COMPUTE_UNUSED(input_offset);
-
- qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
- out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
- return out;
-}
-
/** Perform a convolve3x3 on uint8_t
*
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] fixed_point_position (Optional) Fixed point position.
- * @param[in] input_offset (Optional) Input quantization offset.
+ * @param[in] in_top Pointer to the first row of the input.
+ * @param[in] in_mid Pointer to the second row of the input.
+ * @param[in] in_low Pointer to the third row of the input.
+ * @param[in] m0 First row of the filter.
+ * @param[in] m1 Second row of the filter.
+ * @param[in] m2 Third row of the filter.
+ * @param[in] input_offset (Optional) Input quantization offset.
*
*/
template <unsigned int stridex>
int32x4x2_t convolve_3x3(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- int fixed_point_position, int input_offset);
+ int input_offset);
template <>
inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- int fixed_point_position, int input_offset)
+ int input_offset)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
-
const int32x4_t v_input_offset = vdupq_n_s32(input_offset);
const uint8x8x2_t vtop =
@@ -427,11 +292,9 @@ inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid,
template <>
inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- int fixed_point_position, int input_offset)
+ int input_offset)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
-
- int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
+ int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset);
out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 2), out.val[0], 3);
@@ -441,10 +304,9 @@ inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid,
template <>
inline int32x4x2_t convolve_3x3<3>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- int fixed_point_position, int input_offset)
+ int input_offset)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
+ int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset);
out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 3), out.val[0], 1);
return out;
}
@@ -477,34 +339,6 @@ inline void store_results<3>(float *buffer, const float32x4x2_t &values)
vst1_f32(buffer, vget_low_f32(values.val[0]));
}
-/** Stores a qint16_t array into a memory location.
- *
- * @param[in] buffer Pointer to the memory location where the values will be stored.
- * @param[in] values Values that will be stored.
- *
- */
-template <unsigned int stridex>
-void store_results(qint16_t *buffer, const qint16x8x2_t &values);
-
-template <>
-inline void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
-{
- vst1q_qs16(buffer, values.val[0]);
- vst1q_qs16(buffer + 8, values.val[1]);
-}
-
-template <>
-inline void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
-{
- vst1q_qs16(buffer, values.val[0]);
-}
-
-template <>
-inline void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
-{
- vst1_qs16(buffer, vget_low_s16(values.val[0]));
-}
-
/** Stores a uint32_t array into a memory location.
*
* @param[in] buffer Pointer to the memory location where the values will be stored.
@@ -557,25 +391,20 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr)
/** Perform a convolve3x3 on float16.
*
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] fixed_point_position (Optional) Fixed point position.
+ * @param[in] in_top Pointer to the first row of the input.
+ * @param[in] in_mid Pointer to the second row of the input.
+ * @param[in] in_low Pointer to the third row of the input.
+ * @param[in] m0 First row of the filter.
+ * @param[in] m1 Second row of the filter.
+ * @param[in] m2 Third row of the filter.
*
*/
template <unsigned int stridex>
-float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- int fixed_point_position);
+float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2);
template <>
-inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- int fixed_point_position)
+inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
-
const float16x8x3_t vtop =
{
{
@@ -627,10 +456,9 @@ inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *i
}
template <>
-inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- int fixed_point_position)
+inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2)
{
- float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
@@ -638,10 +466,9 @@ inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *i
}
template <>
-inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- int fixed_point_position)
+inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2)
{
- float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
return out;
}
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 882e4ec1d0..681e27033e 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -98,12 +98,6 @@ public:
_parent->set_format(format);
return *this;
};
- ITensorInfo &set_fixed_point_position(int fixed_point_position) override
- {
- ARM_COMPUTE_ERROR_ON(_parent == nullptr);
- _parent->set_fixed_point_position(fixed_point_position);
- return *this;
- };
ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override
{
@@ -143,11 +137,6 @@ public:
return _parent->offset_element_in_bytes(_coords);
}
size_t offset_element_in_bytes(const Coordinates &pos) const override;
- int fixed_point_position() const override
- {
- ARM_COMPUTE_ERROR_ON(_parent == nullptr);
- return _parent->fixed_point_position();
- }
size_t element_size() const override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index f8cfb35357..1eaf052d8e 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -86,20 +86,18 @@ public:
*
* Can be used for automatic derivation of the shape by the function.
*
- * @param[in] num_channels It indicates the number of channels for each tensor element
- * @param[in] data_type Data type to use for each tensor element
- * @param[in] fixed_point_position (Optional) It specifies the fixed point position when the tensor data type is QS8, QS16 or QS32.
+ * @param[in] num_channels It indicates the number of channels for each tensor element
+ * @param[in] data_type Data type to use for each tensor element
*/
- TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position = 0);
+ TensorInfo(size_t num_channels, DataType data_type);
/** Constructor
*
- * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements.
- * @param[in] num_channels It indicates the number of channels for each tensor element
- * @param[in] data_type Data type to use for each tensor element
- * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements.
+ * @param[in] num_channels It indicates the number of channels for each tensor element
+ * @param[in] data_type Data type to use for each tensor element
*/
- TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+ TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type);
/** Constructor
*
@@ -146,20 +144,18 @@ public:
*
* Can be used for automatic derivation of the shape by the function.
*
- * @param[in] num_channels Desired number of channels for each tensor element.
- * @param[in] data_type Data type to use for each tensor element.
- * @param[in] fixed_point_position (Optional) Fixed point position when the tensor data type is QS8, QS16 or QS32.
+ * @param[in] num_channels Desired number of channels for each tensor element.
+ * @param[in] data_type Data type to use for each tensor element.
*/
- void init(size_t num_channels, DataType data_type, size_t fixed_point_position = 0);
+ void init(size_t num_channels, DataType data_type);
/** Initialize the metadata structure with the given parameters
*
- * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
- * @param[in] num_channels Desired number of channels for each tensor element.
- * @param[in] data_type Data type to use for each tensor element.
- * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
+ * @param[in] num_channels Desired number of channels for each tensor element.
+ * @param[in] data_type Data type to use for each tensor element.
*/
- void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+ void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type);
/** Initialize the metadata structure with the given parameters
*
@@ -169,10 +165,9 @@ public:
* @param[in] strides_in_bytes Stride in bytes for accessing each dimension of the tensor.
* @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
* @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element).
- * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
*/
void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes, int fixed_point_position = 0);
+ size_t total_size_in_bytes);
/** Initialize the metadata structure for the given HOG's metadata
*
* @param[in] hog_info HOG's metadata used to allocate normalized HOG space
@@ -190,19 +185,18 @@ public:
* @return Total allocation size including padding in bytes.
*/
size_t init_auto_padding(const TensorShape &tensor_shape, Format format);
- /** Initialize the metadata structure for the given tensor shape, number of channels,
- * data type and fixed point position. (Padding is automatically calculated)
+ /** Initialize the metadata structure for the given tensor shape, number of channels and
+ * data type. (Padding is automatically calculated)
*
* @note The padding used by this method is really conservative so that the tensor can be used for most functions.
*
- * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements
- * @param[in] num_channels It indicates the number of channels for each tensor element
- * @param[in] data_type Data type to use for each tensor element
- * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements
+ * @param[in] num_channels It indicates the number of channels for each tensor element
+ * @param[in] data_type Data type to use for each tensor element
*
* @return Total allocation size including padding in bytes.
*/
- size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+ size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type);
/** Initialize the metadata structure for the given HOG's metadata
*
* @note init_auto_padding will be used for the tensor initialization.
@@ -221,7 +215,6 @@ public:
ITensorInfo &set_num_channels(int num_channels) override;
ITensorInfo &set_format(Format format) override;
ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
- ITensorInfo &set_fixed_point_position(int fixed_point_position) override;
ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override;
ITensorInfo &set_data_layout(const DataLayout &data_layout) override;
ITensorInfo &reset_padding() override;
@@ -244,10 +237,6 @@ public:
return _offset_first_element_in_bytes;
}
size_t offset_element_in_bytes(const Coordinates &pos) const override;
- int fixed_point_position() const override
- {
- return _fixed_point_position;
- }
size_t element_size() const override
{
return data_size_from_type(_data_type) * _num_channels;
@@ -318,7 +307,6 @@ private:
std::tuple<Strides, size_t, size_t> calculate_padding_requirements(const PaddingSize &padding);
size_t _total_size;
- int _fixed_point_position;
size_t _offset_first_element_in_bytes;
Strides _strides_in_bytes;
size_t _num_channels;
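
The trimmed TensorInfo interface in use, as a sketch built directly on the declarations above (shape and type values are illustrative only):

    // Construct metadata directly...
    TensorInfo info(TensorShape(224U, 224U, 3U), 1, DataType::F32);

    // ...or initialise in place; the auto-padding variant returns the total
    // allocation size in bytes, as documented above.
    TensorInfo padded;
    const size_t total_bytes = padded.init_auto_padding(TensorShape(224U, 224U, 3U), 1, DataType::F16);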
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index da28e131de..89fd4b8bb4 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -74,11 +74,9 @@ enum class DataType
UNKNOWN, /**< Unknown data type */
U8, /**< unsigned 8-bit number */
S8, /**< signed 8-bit number */
- QS8, /**< quantized, symmetric fixed-point 8-bit number */
QASYMM8, /**< quantized, asymmetric fixed-point 8-bit number */
U16, /**< unsigned 16-bit number */
S16, /**< signed 16-bit number */
- QS16, /**< quantized, symmetric fixed-point 16-bit number */
U32, /**< unsigned 32-bit number */
S32, /**< signed 32-bit number */
QS32, /**< quantized, symmetric fixed-point 32-bit number */
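
A hedged migration sketch for code that used the removed enumerators: a symmetric fixed-point tensor moves to QASYMM8 plus an explicit QuantizationInfo (the scale and offset below are placeholders, not values taken from this patch):

    // before (removed by this patch):
    //   TensorInfo(shape, 1, DataType::QS8, /* fixed_point_position */ 4);
    TensorInfo migrated(TensorShape(16U, 16U), 1, DataType::QASYMM8);
    migrated.set_quantization_info(QuantizationInfo(0.1f, 128));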
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 060d5904d4..cfebfa1506 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -110,13 +110,11 @@ inline size_t data_size_from_type(DataType data_type)
{
case DataType::U8:
case DataType::S8:
- case DataType::QS8:
case DataType::QASYMM8:
return 1;
case DataType::U16:
case DataType::S16:
case DataType::F16:
- case DataType::QS16:
return 2;
case DataType::F32:
case DataType::U32:
@@ -185,12 +183,10 @@ inline size_t element_size_from_data_type(DataType dt)
{
case DataType::S8:
case DataType::U8:
- case DataType::QS8:
case DataType::QASYMM8:
return 1;
case DataType::U16:
case DataType::S16:
- case DataType::QS16:
case DataType::F16:
return 2;
case DataType::U32:
@@ -522,14 +518,10 @@ inline DataType get_promoted_data_type(DataType dt)
return DataType::U16;
case DataType::S8:
return DataType::S16;
- case DataType::QS8:
- return DataType::QS16;
case DataType::U16:
return DataType::U32;
case DataType::S16:
return DataType::S32;
- case DataType::QS16:
- return DataType::QS32;
case DataType::QASYMM8:
case DataType::F16:
case DataType::U32:
@@ -1018,29 +1010,7 @@ inline bool is_data_type_quantized(DataType dt)
{
switch(dt)
{
- case DataType::QS8:
case DataType::QASYMM8:
- case DataType::QS16:
- case DataType::QS32:
- return true;
- default:
- return false;
- }
-}
-
-/** Check if a given data type is of fixed point type
- *
- * @param[in] dt Input data type.
- *
- * @return True if data type is of fixed point type, else false.
- */
-inline bool is_data_type_fixed_point(DataType dt)
-{
- switch(dt)
- {
- case DataType::QS8:
- case DataType::QS16:
- case DataType::QS32:
return true;
default:
return false;
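
Post-patch behaviour of the Utils.h helpers edited above, as assert-style sketches (assumes arm_compute/core/Utils.h and <cassert> are included; each line follows from the hunks shown):

    assert(data_size_from_type(DataType::QASYMM8) == 1);          // 1-byte group, QS8 case gone
    assert(element_size_from_data_type(DataType::F16) == 2);      // 2-byte group, QS16 case gone
    assert(get_promoted_data_type(DataType::S8) == DataType::S16);
    assert(is_data_type_quantized(DataType::QASYMM8));
    assert(!is_data_type_quantized(DataType::QS32));               // QS32 dropped from the switch above
    // is_data_type_fixed_point() is removed outright; callers must delete the check.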
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index 4ef94f2c6d..1646ebe719 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -545,71 +545,6 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function,
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
-/** Return an error if the passed tensor infos have different fixed point data types or different fixed point positions
- *
- * @note: If the first tensor doesn't have a fixed point data type, the function returns without throwing an error
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_info_1 The first tensor info to be compared.
- * @param[in] tensor_info_2 The second tensor info to be compared.
- * @param[in] tensor_infos (Optional) Further allowed tensor infos.
- *
- * @return Status
- */
-template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_fixed_point(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
-{
- DataType &&first_data_type = tensor_info_1->data_type();
- const int first_fixed_point_position = tensor_info_1->fixed_point_position();
-
- if(!is_data_type_fixed_point(first_data_type))
- {
- return arm_compute::Status{};
- }
-
- const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->data_type() != first_data_type;
- }),
- function, file, line, "Tensors have different fixed point data types");
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->fixed_point_position() != first_fixed_point_position;
- }),
- function, file, line, "Tensors have different fixed point positions");
-
- return arm_compute::Status{};
-}
-/** Return an error if the passed tensors have different fixed point data types or different fixed point positions
- *
- * @note: If the first tensor doesn't have a fixed point data type, the function returns without throwing an error
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_1 The first tensor to be compared.
- * @param[in] tensor_2 The second tensor to be compared.
- * @param[in] tensors (Optional) Further allowed tensors.
- *
- * @return Status
- */
-template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_fixed_point(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point(function, file, line, tensor_1->info(), tensor_2->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
- return arm_compute::Status{};
-}
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
-#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
-
/** Return an error if the passed tensor infos have different asymmetric quantized data types or different quantization info
*
 * @note: If the first tensor info doesn't have an asymmetric quantized data type, the function returns without throwing an error
@@ -976,96 +911,5 @@ arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
-
-/** Return an error if the input fixed-point positions are different.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_info_1 The first tensor info to be compared.
- * @param[in] tensor_info_2 The second tensor info to be compared.
- * @param[in] tensor_infos (Optional) Further allowed tensor infos.
- *
- * @return Status
- */
-template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
-{
- const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_info_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_info_array.begin(), tensor_info_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->fixed_point_position() != tensor_info_1->fixed_point_position();
- }),
- function, file, line, "Tensors have different fixed-point positions");
- return arm_compute::Status{};
-}
-/** Return an error if the input fixed-point positions are different.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_1 The first tensor to be compared.
- * @param[in] tensor_2 The second tensor to be compared.
- * @param[in] tensors (Optional) Further allowed tensors.
- *
- * @return Status
- */
-template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point_position(function, file, line, tensor_1->info(), tensor_2->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
- return arm_compute::Status{};
-}
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__))
-#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__))
-
-/** Return an error if the fixed-point value is not representable in the specified Q format.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] value The floating point value to be checked.
- * @param[in] tensor_info Input tensor info that has information on data type and fixed-point position.
- *
- * @return Status
- */
-inline arm_compute::Status error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
- float value, const ITensorInfo *tensor_info)
-{
- const int fixed_point_position = tensor_info->fixed_point_position();
- const DataType dt = tensor_info->data_type();
- const unsigned int q_max_range = 0xFFFFFFFFu >> (((sizeof(unsigned int) - element_size_from_data_type(dt)) * 8) + 1);
- const float max_range = q_max_range / (static_cast<float>(1 << fixed_point_position));
-
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(value > max_range, function, file, line,
- "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position);
- return arm_compute::Status{};
-}
-/** Return an error if the fixed-point value is not representable in the specified Q format.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] value The floating point value to be checked.
- * @param[in] tensor Input tensor that has information on data type and fixed-point position.
- *
- * @return Status
- */
-inline arm_compute::Status error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
- float value, const ITensor *tensor)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_value_not_representable_in_fixed_point(function, file, line, value, tensor->info()));
- return arm_compute::Status{};
-}
-#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
-#define ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
}
#endif /* __ARM_COMPUTE_VALIDATE_H__*/
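
For kernels that validated fixed-point agreement with the macros deleted above, the asymmetric-quantization counterpart that this header keeps is the natural replacement. A hedged sketch (input and output are hypothetical ITensor pointers; the macro name follows the error_on_mismatching_quantization_info helper whose documentation appears in the context above):

    // before (deleted by this patch):
    //   ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input->info(), output->info());
    // after:
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input->info(), output->info());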