From 7485d5a62685cb745ab50e970adb722cb71557ac Mon Sep 17 00:00:00 2001 From: Vidhya Sudhan Loganathan Date: Wed, 4 Jul 2018 09:34:00 +0100 Subject: COMPMID-970 : Remove QS8 / QS16 support Removed fixed point related code. Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../core/NEON/kernels/NEActivationLayerKernel.h | 17 +- .../core/NEON/kernels/NEArithmeticAdditionKernel.h | 20 +- .../NEON/kernels/NEArithmeticSubtractionKernel.h | 20 +- .../NEON/kernels/NEBatchNormalizationLayerKernel.h | 21 +- arm_compute/core/NEON/kernels/NECol2ImKernel.h | 4 +- .../kernels/NEConvertFullyConnectedWeightsKernel.h | 4 +- .../NEON/kernels/NEDepthConcatenateLayerKernel.h | 2 +- .../core/NEON/kernels/NEDepthConvertLayerKernel.h | 13 +- .../NEON/kernels/NEDirectConvolutionLayerKernel.h | 8 +- .../NEDirectConvolutionLayerOutputStageKernel.h | 8 +- arm_compute/core/NEON/kernels/NEFillBorderKernel.h | 2 +- .../core/NEON/kernels/NEFillInnerBorderKernel.h | 2 +- .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 6 +- .../kernels/NEGEMMMatrixAccumulateBiasesKernel.h | 4 +- .../core/NEON/kernels/NEGEMMMatrixAdditionKernel.h | 4 +- .../core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h | 4 +- .../core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 4 +- arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 4 +- .../core/NEON/kernels/NENormalizationLayerKernel.h | 16 +- arm_compute/core/NEON/kernels/NEPermuteKernel.h | 4 +- .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 24 +- .../core/NEON/kernels/NEPoolingLayerKernel.h | 50 +---- .../core/NEON/kernels/NEReshapeLayerKernel.h | 2 +- .../core/NEON/kernels/NESoftmaxLayerKernel.h | 8 +- arm_compute/core/NEON/kernels/NETransposeKernel.h | 6 +- .../core/NEON/kernels/NEWeightsReshapeKernel.h | 4 +- .../NEON/kernels/detail/NEDirectConvolution3x3.h | 14 +- .../kernels/detail/NEDirectConvolutionDetail.h | 249 ++++----------------- 28 files changed, 121 insertions(+), 403 deletions(-) (limited to 'arm_compute/core/NEON/kernels') diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h index 06a0a01782..0290e32085 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -24,7 +24,6 @@ #ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ #define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ -#include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/QAsymm8.h" @@ -59,7 +58,7 @@ public: * @note If the output tensor is a nullptr, the activation function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * of the activation function. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] activation_info Activation layer information. */ @@ -67,7 +66,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * of the activation function. Data types supported: QASYMM8/F16/F32. 
* @param[in] output Destination tensor info. Data type supported: same as @p input * @param[in] act_info Activation layer information. * @@ -99,24 +98,12 @@ private: template typename std::enable_if::value, void>::type activation(const Window &window); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template - typename std::enable_if::value, void>::type activation(const Window &window); /** Function to apply an activation function on a tensor. * * @param[in] window Region on which to execute the kernel */ template typename std::enable_if::value, void>::type activation(const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template - typename std::enable_if::value, void>::type activation(const Window &window); private: ITensor *_input; diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h index 155e792f5d..8cf21eae9d 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -57,26 +57,24 @@ public: * Valid configurations (Input1,Input2) -> Output : * * - (U8,U8) -> U8 - * - (QS8,QS8) -> QS8 * - (U8,U8) -> S16 * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 - * - (QS16,QS16) -> QS16 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] policy Overflow policy. * * @return a status @@ -90,9 +88,9 @@ public: private: /** Common signature for all the specialised add functions * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] window Region on which to execute the kernel. 
*/ using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h index 73ecfcfeb5..3e93922b65 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -57,26 +57,24 @@ public: * Valid configurations (Input1,Input2) -> Output : * * - (U8,U8) -> U8 - * - (QS8,QS8) -> QS8 * - (U8,U8) -> S16 * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 - * - (QS16,QS16) -> QS16 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel * - * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32 + * @param[in] output Output tensor. Data types supported: U8/S16/F16/F32 * @param[in] policy Policy to use to handle overflow. * * @return a status @@ -89,9 +87,9 @@ public: private: /** Common signature for all the specialised sub functions * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32. * @param[in] window Region on which to execute the kernel. */ using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 2d33f87dfa..2a540c151b 100644 --- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -57,7 +57,7 @@ public: * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[out] output Destination tensor. 
Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input @@ -72,7 +72,7 @@ public: * * @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. - * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * The rest are optional and used for representing batches. Data types supported: F16/F32. * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input @@ -96,22 +96,7 @@ private: void configure_non_fused(); /** Configure execution function in case of fused activation **/ void configure_fused(); - /** Template function to run batch normalization on 8-bit fixed point - * - * @tparam fused_activation Boolean that flags if its a fused activation or not - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void batch_normalization_qs8(const Window &window); - /** Template function to run batch normalization on 16-bit fixed point - * - * @tparam fused_activation Boolean that flags if its a fused activation or not - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void batch_normalization_qs16(const Window &window); + /** Template function to run batch normalization on fp16 * * @tparam fused_activation Boolean that flags if its a fused activation or not diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h index 9fb493cc4f..f02858e7d9 100644 --- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h +++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h @@ -72,7 +72,7 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. @@ -80,7 +80,7 @@ public: void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims); /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output The output tensor. 
3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h index 65ce764246..d5c9e3bbe9 100644 --- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h +++ b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h @@ -59,7 +59,7 @@ public: ~NEConvertFullyConnectedWeightsKernel() = default; /** Set the input and output tensor. * - * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32. + * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32. * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format. * @param[in] data_layout The data layout the weights have been trained in. @@ -67,7 +67,7 @@ public: void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeightsKernel * - * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32. + * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32. * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format. * @param[in] data_layout The data layout the weights have been trained in. diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h index 67ef5293b7..12a5051ef8 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h @@ -55,7 +55,7 @@ public: ~NEDepthConcatenateLayerKernel() = default; /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. + * @param[in] input Input tensor. Data types supported: F16/F32. * @param[in] depth_offset The offset on the Z axis. * @param[in,out] output Output tensor. Data types supported: Same as @p input. * diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h index 50536f2b47..77bb0413ca 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h @@ -55,19 +55,12 @@ public: * * Valid conversions Input -> Output : * - * - QS8 -> QS8, F32 * - U8 -> U16, S16, S32 * - U16 -> U8, U32 * - S16 -> U8, S32 - * - QS16 -> QS16, F32 - * - F32 -> QS8 * - * @warning In case of in-place fixed point position conversion make sure that configure has been called - * before the updated tensor is used in other functions, as the TensorInfo of the tensor will be - * altered. 
In-place is only supported for QS8 -> QS8, QS16 -> QS16. - * - * @param[in, out] input The input tensor to convert (Written in case of in-place computation). Data types supported: U8/QS8/U16/S16/F32. - * @param[out] output The output tensor. Can be null in case of in-place computation. Data types supported: U8/QS8/U16/S16/U32/S32/F32. + * @param[in, out] input The input tensor to convert (Written in case of in-place computation). Data types supported: U8/U16/S16. + * @param[out] output The output tensor. Can be null in case of in-place computation. Data types supported: U8/U16/S16/U32/S32/F32. * @param[in] policy Conversion policy. * @param[in] shift (Optional) Value for down/up conversions. Must be 0 <= shift < 8. * In case of fixed point position conversion, it specifies the new fixed point position, if operation is in-place. @@ -82,8 +75,6 @@ private: ITensor *_output; ConvertPolicy _policy; uint32_t _shift; - int _fixed_point_position_input; - int _fixed_point_position_output; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h index f859f97dae..589725ab01 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h @@ -57,24 +57,24 @@ public: * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. * @param[out] output Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32 + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. */ void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info); /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. * @param[in] output Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32 + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. 
Data types supported: QS32/F16/F32 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h index 77711d7ecd..7fd1d70374 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h @@ -55,10 +55,10 @@ public: /** Set the accumulate buffer and the biases of the kernel. * * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS16/QS32/F16/F32 + * Data type supported: QS32/F16/F32 * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Data type supported: QS8/QS16/F16/F32 + * Data type supported: F16/F32 * @param[in] result_fixedpoint_multiplier (Optional)Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add * @param[in] result_shift (Optional)Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication * @param[in] result_offset_after_shift (Optional)Offset to be applied to result before converting it back to QASYMM8 @@ -68,10 +68,10 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel * * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. - * Data type supported: QS16/QS32/F16/F32 + * Data type supported: QS32/F16/F32 * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Data type supported: QS8/QS16/F16/F32 + * Data type supported: F16/F32 * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr); diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h index dd19b8f35a..cff6b4ea2d 100644 --- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h +++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h @@ -57,7 +57,7 @@ public: * * @note This kernel fills the borders within the XY-planes. * - * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QS8/QASYMM8/QS16/S16/S32/F32. + * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QASYMM8/S16/S32/F32. * @param[in] border_size Size of the border to fill in elements. * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h index 545a265dc2..2b6c7af72a 100644 --- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h +++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h @@ -57,7 +57,7 @@ public: * * @note This kernel fills the borders within the XY-planes. 
* - * @param[in,out] input Tensor to process. Data types supported: U8/QS8/S16/S32/F32. + * @param[in,out] input Tensor to process. Data types supported: U8/S16/S32/F32. * @param[in] border_size Size of the border to fill in elements. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h index 79504fd4da..5c0104d138 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -60,13 +60,13 @@ public: NEGEMMInterleave4x4Kernel(); /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. * * @return a status @@ -79,7 +79,7 @@ public: private: /** Common signature for all the transpose functions * - * @param[in] input An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input An input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h index e48a9a77e4..419a9f9150 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h @@ -51,13 +51,13 @@ public: ~NEGEMMMatrixAccumulateBiasesKernel() = default; /** Set the accumulate buffer and the biases of the kernel. * - * @param[in, out] accum The accumulate tensor to convert. Data type supported: QS8/QS16/F32 + * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32 * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p input */ void configure(ITensor *accum, const ITensor *biases); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel * - * @param[in] accum The accumulate tensor to convert. Data type supported: QS8/QS16/F32 + * @param[in] accum The accumulate tensor to convert. Data type supported: F32 * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. 
Data type supported: Same as @p input * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h index 5e4f8b72ff..1a235933dc 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h @@ -59,7 +59,7 @@ public: * * @note The input and output tensor must have the same dimensions * - * @param[in] input Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32 + * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32 * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input. * @param[in] beta Weight of matrix C */ @@ -71,7 +71,7 @@ public: private: /** Common signature for all the matrix addition functions * - * @param[in] input An input tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] input An input tensor. Data types supported: F16/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. * @param[in] beta Weight of matrix C diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h index d54522c678..6ee958205e 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h @@ -58,7 +58,7 @@ public: * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel * These two kernels change the layout of the original matrices to be more cache-friendly. * - * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32 + * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector. * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0 * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. @@ -69,7 +69,7 @@ public: void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel * - * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32 + * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector. * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0 * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. 
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h index fcdd8dd93c..b7fbfcfcd2 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h @@ -74,13 +74,13 @@ public: } /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor info. Data type supported: same as @p input. * * @return a status diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h index 5aa803f4fd..d455fd98b3 100644 --- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h @@ -77,7 +77,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * Note: QASYMM8 works only for has_bias = false * @param[out] output The output tensor. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). @@ -92,7 +92,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 * Note: QASYMM8 works only for has_bias = false * @param[in] output The output tensor. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h index 6ae7b73423..92086437a6 100644 --- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h @@ -54,7 +54,7 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. + * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. * @param[in] input_squared Source with each element has been squared. 
3 lower dims represent a single input with dimensions [width, height, IFM], * Data type supported: same as @p input * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input @@ -64,7 +64,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. + * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], * Data type supported: same as @p input * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input @@ -92,18 +92,6 @@ private: template void normalize_float(const Window &window); - /** Function to perform normalization for fixed-point values depending on - * the given template dimension. The second template parameter specifies - * whether the normalization has to be 1D or 2D. - * - * @note Only supported normalizations are: - * - 1D over X or Z - * - 2D over X and Y - * - * @param[in] window Region on which to execute the kernel. - */ - template - void normalize_fixed_point(const Window &window); /** Common signature for all the specialised normalization functions * * @param[in] window Region on which to execute the kernel. diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/arm_compute/core/NEON/kernels/NEPermuteKernel.h index 68bbdcb3cb..b56faa8514 100644 --- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h +++ b/arm_compute/core/NEON/kernels/NEPermuteKernel.h @@ -58,7 +58,7 @@ public: * * @note Supported permutation vectors : [2, 0, 1], [1, 2, 0] * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data types supported: Same as @p input * @param[in] perm Permutation vector */ @@ -67,7 +67,7 @@ public: * * @note Supported permutation vectors : [2, 0, 1], [1, 2, 0] * - * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output The output tensor. Data types supported: Same as @p input * @param[in] perm Permutation vector * diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h index 8c245569a5..41ea91495f 100644 --- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h @@ -55,11 +55,10 @@ public: * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * For QS8/QS16 scale = 1 is the only supported value. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. 
Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. @@ -70,11 +69,10 @@ public: * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * For QS8/QS16 scale = 1 is the only supported value. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. @@ -96,15 +94,6 @@ private: * @param[out] output_ptr Pointer to the output tensor. */ using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale); - /** Common signature for all the specialised multiplication functions with fixed-point values - * - * @param[in] input1_ptr Pointer to the first input tensor. - * @param[in] input2_ptr Pointer to the second input tensor. - * @param[in] scale Scaling factor. - * @param[in] fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number. - * @param[out] output_ptr Pointer to the output tensor. - */ - using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position); /** Common signature for all the specialised multiplication functions with float scaling factor * * @param[in] input1_ptr Pointer to the first input tensor. 
@@ -115,7 +104,6 @@ private: MulFunctionFloat *_func_float; MulFunctionInt *_func_int; - MulFunctionQInt *_func_q_int; private: const ITensor *_input1; diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h index 4140ccf1ed..6c4c1db289 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -52,18 +52,18 @@ public: ~NEPoolingLayerKernel() = default; /** Set the input and output tensors. * - * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only + * @note F16 are supported for pool sizes 2 and 3 only * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel * - * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only + * @note F16 are supported for pool sizes 2 and 3 only * - * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @@ -90,13 +90,6 @@ private: */ template void pooling2_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform 2x2 pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling2_q8_nchw(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for 8bit asymmetric fixed point. * * @param[in] window_input Input region on which to execute the kernel. @@ -104,13 +97,6 @@ private: */ template void pooling2_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform 2x2 pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling2_q16_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling. * * @param[in] window_input Input region on which to execute the kernel. @@ -125,13 +111,6 @@ private: */ template void pooling3_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform 3x3 pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling3_q8_nchw(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling for 8bit quantized fixed point. * * @param[in] window_input Input region on which to execute the kernel. @@ -139,13 +118,6 @@ private: */ template void pooling3_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform 3x3 pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. 
- * @param[in] window Output region on which to execute the kernel. - */ - template - void pooling3_q16_nchw(const Window &window_input, const Window &window); /** Function to perform 7x7 pooling. * * @param[in] window_input Input region on which to execute the kernel. @@ -153,13 +125,6 @@ private: */ template void pooling7_f32_nchw(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 8bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void poolingMxN_q8_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 8-bit quantized. * * @param[in] window_input Input region on which to execute the kernel. @@ -174,13 +139,6 @@ private: */ template void poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 16bit fixed point. - * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. - */ - template - void poolingMxN_q16_nchw(const Window &window_input, const Window &window); /** Function to perform MxN pooling for 16-bit floating point values. * * @param[in] window_input Input region on which to execute the kernel. diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h index 0a3fc44881..08b4e11189 100644 --- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h @@ -40,7 +40,7 @@ public: } /** Set the input and output of the kernel * - * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/QASYMM8/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/U16/S16/QASYMM8/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h index c30a4cd23d..25c3196e34 100644 --- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -43,13 +43,13 @@ public: NELogits1DMaxKernel(); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[out] output Destination tensor. Data types supported: same as @p input */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] output Destination tensor. Data types supported: same as @p input * * @return a status @@ -90,7 +90,7 @@ public: ~NELogits1DSoftmaxKernel() = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. * @param[in] max Max values tensor. Same shape as input with dimension 0 set to 1. * Data types supported: same as @p input. * @param[out] output Destination tensor. 
Data types supported: same as @p input. @@ -101,7 +101,7 @@ public: void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp); /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel * - * @param[in] input Source tensor info. Data types supported: QASYMM8/QS8/QS16/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. * Data types supported: same as @p input. * @param[in] output Destination tensor info. Data types supported: same as @p input. diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h index dc7ef8ff7a..76823acfa1 100644 --- a/arm_compute/core/NEON/kernels/NETransposeKernel.h +++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h @@ -57,13 +57,13 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] output Output tensor. Data type supported: Same as @p input * * @return a status @@ -76,7 +76,7 @@ public: private: /** Common signature for all the transpose functions * - * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input An input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. */ diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h index 1a7525bfc7..21f36f6c2b 100644 --- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h +++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h @@ -75,7 +75,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F32 * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. @@ -85,7 +85,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel * * @param[in] input The input tensor to convert. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32 * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h index fee206638b..fd0c0f0c34 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -45,13 +45,11 @@ inline float32x4x3_t load_matrix_row(const float *ptr) } template -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position); +float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const float32x4x3_t vtop = { { @@ -108,9 +106,9 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); @@ -118,9 +116,9 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) { - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); return out; } diff --git 
a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index 908fa13876..d56fd44700 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -55,29 +55,6 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) return r; } -/** Loads a 3x3 matrix as a row (qint8_t). - * - * @param[in] ptr Pointer to a qint8 3x3 matrix. - * @param[in] weights_offset (Optional) Weights quantization offset. - * - * @return The loaded matrix. - */ -inline qint8x8x3_t load_matrix_row(const qint8_t *ptr, int weights_offset = 0) -{ - ARM_COMPUTE_UNUSED(weights_offset); - /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: - r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - const qint8x8x3_t r = - { - { - vld1_dup_qs8(ptr), - vld1_dup_qs8(1 + ptr), - vld1_dup_qs8(2 + ptr) - } - }; - return r; -} - /** Loads a 3x3 matrix as a row (uint8_t). * * @param[in] ptr Pointer to a uint8_t 3x3 matrix. @@ -104,27 +81,25 @@ inline int32x4x3_t load_matrix_row(const uint8_t *ptr, int weights_offset = 0) /** Perform a convolve3x3 on float32. * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] input_offset (Optional) Input quantization offset. 
* */ template float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset = 0); + int input_offset = 0); template <> inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); ARM_COMPUTE_UNUSED(input_offset); const float32x4x3_t vtop = @@ -185,11 +160,11 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c template <> inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); @@ -199,145 +174,35 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c template <> inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); return out; } -/** Perform a convolve3x3 on qint16. - * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. 
- * - */ -template -qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset = 0); - -template <> -inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(fixed_point_position); - ARM_COMPUTE_UNUSED(input_offset); - - const qint8x8x3_t vtop = - { - { - vld1_qs8(in_top), - vld1_qs8(in_top + 8), - vld1_qs8(in_top + 16) - } - }; - const qint8x8x3_t vmid = - { - { - vld1_qs8(in_mid), - vld1_qs8(in_mid + 8), - vld1_qs8(in_mid + 16) - } - }; - const qint8x8x3_t vlow = - { - { - vld1_qs8(in_low), - vld1_qs8(in_low + 8), - vld1_qs8(in_low + 16) - } - }; - qint16x8x2_t out = - { - { - vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position), - vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position) - } - }; - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position); - out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position); - out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position); - return out; -} - -template <> -inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(input_offset); - - qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5); - out.val[0] = 
vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7); - return out; -} - -template <> -inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, - const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, - int fixed_point_position, int input_offset) -{ - ARM_COMPUTE_UNUSED(input_offset); - - qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2); - out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3); - return out; -} - /** Perform a convolve3x3 on uint8_t * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. - * @param[in] input_offset (Optional) Input quantization offset. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] input_offset (Optional) Input quantization offset. * */ template int32x4x2_t convolve_3x3(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset); + int input_offset); template <> inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const int32x4_t v_input_offset = vdupq_n_s32(input_offset); const uint8x8x2_t vtop = @@ -427,11 +292,9 @@ inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, template <> inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - - int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 2), out.val[0], 3); @@ -441,10 +304,9 @@ inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, template <> inline int32x4x2_t convolve_3x3<3>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - int fixed_point_position, int input_offset) + int input_offset) { - ARM_COMPUTE_UNUSED(fixed_point_position); - int32x4x2_t out = 
convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset); + int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset); out.val[0] = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 3), out.val[0], 1); return out; } @@ -477,34 +339,6 @@ inline void store_results<3>(float *buffer, const float32x4x2_t &values) vst1_f32(buffer, vget_low_f32(values.val[0])); } -/** Stores a qint16_t array into a memory location. - * - * @param[in] buffer Pointer to the memory location where the values will be stored. - * @param[in] values Values that will be stored. - * - */ -template -void store_results(qint16_t *buffer, const qint16x8x2_t &values); - -template <> -inline void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1q_qs16(buffer, values.val[0]); - vst1q_qs16(buffer + 8, values.val[1]); -} - -template <> -inline void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1q_qs16(buffer, values.val[0]); -} - -template <> -inline void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1_qs16(buffer, vget_low_s16(values.val[0])); -} - /** Stores a uint32_t array into a memory location. * * @param[in] buffer Pointer to the memory location where the values will be stored. @@ -557,25 +391,20 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr) /** Perform a convolve3x3 on float16. * - * @param[in] in_top Pointer to the first row of the input. - * @param[in] in_mid Pointer to the second row of the input. - * @param[in] in_low Pointer to the third row of the input. - * @param[in] m0 First row of the filter. - * @param[in] m1 Second row of the filter. - * @param[in] m2 Third row of the filter. - * @param[in] fixed_point_position (Optional) Fixed point position. + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. 
* */ template -float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position); +float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2); template <> -inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - ARM_COMPUTE_UNUSED(fixed_point_position); - const float16x8x3_t vtop = { { @@ -627,10 +456,9 @@ inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *i } template <> -inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3); @@ -638,10 +466,9 @@ inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *i } template <> -inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - int fixed_point_position) +inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2) { - float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); return out; } -- cgit v1.2.1
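A minimal sketch of how the simplified F32 helpers from NEDirectConvolutionDetail.h chain together after this change, for one stride-1 iteration. It assumes the helpers live in the arm_compute::detail namespace and that the 3x3 weights are stored row-major in a contiguous array; the function name convolve_row_f32 is illustrative only and not part of the library. The only optional argument the helpers keep is input_offset, which the quantized (QASYMM8) path still needs; fixed_point_position is gone everywhere.

#include <arm_neon.h>

#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"

void convolve_row_f32(const float *in_top, const float *in_mid, const float *in_low,
                      const float *weights, float *out)
{
    using namespace arm_compute::detail; // assumed namespace

    // load_matrix_row() duplicates each weight of a filter row across all lanes:
    // m.val[0]/[1]/[2] hold the first/second/third weight of that row.
    const float32x4x3_t m0 = load_matrix_row(weights + 0); // filter row 0 (row-major layout assumed)
    const float32x4x3_t m1 = load_matrix_row(weights + 3); // filter row 1
    const float32x4x3_t m2 = load_matrix_row(weights + 6); // filter row 2

    // Stride-1 3x3 convolution producing 8 consecutive output accumulators.
    // There is no fixed_point_position argument any more; the trailing
    // input_offset is only used by the quantized overloads and the F32
    // specialisation ignores it, so 0 is passed here.
    const float32x4x2_t acc = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, 0);

    // For stride 1 all 8 accumulated results are written back.
    store_results<1>(out, acc);
}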