From f87cc7f6fef95f9b022725304118796a6a764a7c Mon Sep 17 00:00:00 2001
From: Pablo Tello
Date: Wed, 26 Jul 2017 10:28:40 +0100
Subject: COMPMID-417: Port NEDirectConvolution 1x1 to QS16.

Change-Id: Icae6a5091e836d0aca24375f43cca9e6d3a2090f
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81662
Reviewed-by: Moritz Pflanzer
Tested-by: Kaizen
Reviewed-by: Anthony Barbier
---
 arm_compute/core/NEON/NEFixedPoint.h                        | 11 +++++++++++
 arm_compute/core/NEON/NEFixedPoint.inl                      | 14 ++++++++++++++
 .../kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h  |  2 +-
 .../core/NEON/kernels/NEDirectConvolutionLayerKernel.h      |  2 +-
 arm_compute/core/Types.h                                    |  1 +
 arm_compute/core/Utils.h                                    |  3 +++
 .../runtime/NEON/functions/NEDirectConvolutionLayer.h       |  2 +-
 7 files changed, 32 insertions(+), 3 deletions(-)
(limited to 'arm_compute')

diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 08f680801d..3de226112e 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -48,6 +48,7 @@ using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 element
 using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
 using qint32x2_t   = int32x2_t;   /**< 32 bit fixed point vector with 2 elements */
 using qint32x4_t   = int32x4_t;   /**< 32 bit fixed point vector with 4 elements */
+using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */
 
 /** Get the lower half of a 16 elements vector
  *
@@ -673,6 +674,16 @@ qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
  */
 qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
 
+/** 16 bit fixed point vector long multiply (4 elements)
+ *
+ * @param[in] a                    First 16 bit fixed point input vector
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 32 bit fixed point long vector multiplication.
+ */
+qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
+
 /** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
  *
  * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
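For illustration only (not part of the patch): a possible caller of the new vmull_qs16() declared above. The wrapper name multiply_q14 and the choice of 14 fractional bits are assumptions, and an ARM NEON build with the arm_compute headers is required.

    // Sketch of calling the new 16 bit fixed point long multiply.
    // multiply_q14() is a hypothetical helper, not part of the library.
    #include "arm_compute/core/NEON/NEFixedPoint.h"
    #include <arm_neon.h>

    using namespace arm_compute;

    qint32x4_t multiply_q14(const int16_t *a, const int16_t *b)
    {
        const qint16x4_t va = vld1_s16(a); // load 4 Q1.14 values
        const qint16x4_t vb = vld1_s16(b); // load 4 Q1.14 values
        // 14 fractional bits: the widened 32 bit results keep the Q.14 scale
        return vmull_qs16(va, vb, 14);
    }

vld1_s16() is the plain NEON load; the qintNxM_t names are simply the aliases this header defines over the corresponding NEON vector types.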
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index c879d3e275..dd1066d6bc 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -624,6 +624,20 @@ inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
     return vqrshlq_s16(res, fixed_point_position_s16);
 }
 
+inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s16(tmp, a, b);
+
+    // Shift right by fixed_point_position
+    return vqshlq_s32(tmp, fixed_point_position_s32);
+}
+
 inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
index f098e18655..87788ba389 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -51,7 +51,7 @@ public:
     /** Set the accumulate buffer and the biases of the kernel.
      *
      * @param[in, out] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                        Data type supported: QS8/F32
+     *                        Data type supported: QS8/QS16/F16/F32
      * @param[in]      bias   The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
      * @param[out]     output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
      *                        Data type supported: Same as @p input
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 5612e1ae62..e0dac9858b 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -49,7 +49,7 @@ public:
     /** Set the input, weights, and output tensors.
      *
      * @param[in] input   The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                    while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32.
+     *                    while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
     * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
      *                    The 3rd dimension must be the same as the input's volume 3rd dimension.
      *                    Data type supported:Same as @p input.
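The per-lane arithmetic of the vmull_qs16() implementation added to NEFixedPoint.inl above reduces to the scalar model below; this is a sketch for intuition only, with a made-up helper name and Q1.14 example values.

    // Scalar model of one lane of vmull_qs16(): widen to 32 bit, multiply,
    // add the rounding constant, then arithmetic shift right by the fixed
    // point position. scalar_mull_qs16 is an illustrative name only.
    #include <cstdint>
    #include <cstdio>

    static int32_t scalar_mull_qs16(int16_t a, int16_t b, int fixed_point_position)
    {
        int32_t acc = 1 << (fixed_point_position - 1);             // rounding constant
        acc += static_cast<int32_t>(a) * static_cast<int32_t>(b);  // widening multiply-accumulate
        return acc >> fixed_point_position;                        // scale back
    }

    int main()
    {
        // Q1.14: 0.5 is stored as 0.5 * 2^14 = 8192
        const int32_t r = scalar_mull_qs16(8192, 8192, 14);
        std::printf("raw %d = %f\n", r, r / 16384.0); // prints raw 4096 = 0.25
        return 0;
    }

Unlike this sketch, the NEON version performs the final shift with vqshlq_s32(), so it also saturates on overflow.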
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 765cae4ad4..2d3b3d6f66 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -68,6 +68,7 @@ enum class DataType
     QS16,
     U32,
     S32,
+    QS32,
     U64,
     S64,
     F16,
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 4ecd464cdb..af788beeb7 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -100,6 +100,7 @@ inline size_t data_size_from_type(DataType data_type)
         case DataType::F32:
         case DataType::U32:
         case DataType::S32:
+        case DataType::QS32:
             return 4;
         case DataType::F64:
         case DataType::U64:
@@ -173,6 +174,7 @@ inline size_t element_size_from_data_type(DataType dt)
         case DataType::U32:
         case DataType::S32:
         case DataType::F32:
+        case DataType::QS32:
             return 4;
         default:
             ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -645,6 +647,7 @@ inline bool is_data_type_fixed_point(DataType dt)
     {
         case DataType::QS8:
         case DataType::QS16:
+        case DataType::QS32:
             return true;
         default:
             return false;
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index a66cab3013..872fae3a6b 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -48,7 +48,7 @@ public:
     NEDirectConvolutionLayer();
     /** Set the input, weights, biases and output tensors.
      *
-     * @param[in, out] input   Input tensor. Data types supported: QS8/F16/F32.
+     * @param[in, out] input   Input tensor. Data types supported: QS8/QS16/F16/F32.
      * @param[in]      weights Set of kernels to convolve the input volume.
      *                         The 3rd dimension must be the same as the input's volume 3rd dimension.
      *                         Data type supported: Same as @p input.
-- 
cgit v1.2.1
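With QS32 registered in the type system, the ported layer can be exercised on QS16 tensors roughly as sketched below. The shapes, the Q4.11 fixed point position, and the assumption that configure() takes (input, weights, bias, output, conv_info) are illustrative and not taken from this patch.

    // Sketch: configuring the direct convolution on QS16 tensors.
    // All dimensions and the fixed point position are made up for illustration.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor input, weights, bias, output;
        const int fixed_point_position = 11; // Q4.11, arbitrary choice

        // 8x8 input with 16 feature maps; 1x1 kernels producing 32 output maps
        input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::QS16, fixed_point_position));
        weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 16U, 32U), 1, DataType::QS16, fixed_point_position));
        bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::QS16, fixed_point_position));
        output.allocator()->init(TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::QS16, fixed_point_position));

        NEDirectConvolutionLayer conv;
        conv.configure(&input, &weights, &bias, &output, PadStrideInfo(1, 1, 0, 0));

        input.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        output.allocator()->allocate();

        // Tensors would be filled with Q4.11 data here before running
        conv.run();
        return 0;
    }

The new DataType::QS32 is presumably what the QS16 path uses for its intermediate accumulation before the bias accumulate step narrows results back to QS16.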