From d5e65c71261fd42d3e69478507fbfcc8cf36befc Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 26 Jul 2017 17:09:17 +0100 Subject: COMPMID-456: Add support for QS16 NEON Normalization Layer. Change-Id: I1e542808cfd7774c67cc4e9a58e42449e4fb29aa Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81735 Tested-by: Kaizen Reviewed-by: Anthony Barbier --- arm_compute/core/NEON/NEFixedPoint.h | 25 ++++++++++++++++++++-- arm_compute/core/NEON/NEFixedPoint.inl | 17 +++++++++++++++ .../core/NEON/kernels/NENormalizationLayerKernel.h | 4 ++-- 3 files changed, 42 insertions(+), 4 deletions(-) (limited to 'arm_compute/core') diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index 50463b5efe..08f680801d 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -235,13 +235,22 @@ qint8x16_t vdupq_n_qs8(qint8_t a); /** Duplicate a float and convert it to 8 bit fixed point vector (16 elements) * - * @param[in] a 8 bit fixed point to duplicate + * @param[in] a floating point value to convert and duplicate * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number * * @return The result of the vector duplication */ qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position); +/** Duplicate a float and convert it to 16 bit fixed point vector (8 elements) + * + * @param[in] a floating point value to convert and duplicate + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the vector duplication + */ +qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position); + /** 16 bit fixed point vector duplicate (8 elements) * * @param[in] a 16 bit fixed point to duplicate @@ -1178,7 +1187,19 @@ qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position); * * @return The result of the 8bit power. 
*/ -qint8x8_t vqpowq_qs8(qint8x8_t a, qint8x16_t b, int fixed_point_position); +qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); + +/** Calculate saturating n power for fixed point 16bit (8 elements). + * + * pow(a,b) = e^(b*log(a)) + * + * @param[in] a 16bit fixed point input vector + * @param[in] b 16bit fixed point power vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16bit power. + */ +qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); /** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements * diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl index 7cebfad924..c879d3e275 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -250,6 +250,18 @@ inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position) return vqcvtq_qs8_f32(res, fixed_point_position); } +inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position) +{ + float32x4x2_t res = + { + { + vdupq_n_f32(a), + vdupq_n_f32(a), + } + }; + return vqcvtq_qs16_f32(res, fixed_point_position); +} + inline qint16x8_t vdupq_n_qs16(qint16_t a) { return vdupq_n_s16(a); @@ -1941,6 +1953,11 @@ inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_positio return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position); } +inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position) +{ + return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position); +} + inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b) { float32x4x2_t res = diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h 
index b1bc594e4c..e24e481f46 100644 --- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h @@ -50,7 +50,7 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: QS8/F32. + * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], * Data type supported: same as @p input * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input @@ -86,7 +86,7 @@ private: * * @param[in] window Region on which to execute the kernel. */ - template <bool do_2D_norm> + template <DataType dt, bool do_2D_norm> void normalize_fixed_point(const Window &window); /** Common signature for all the specialised normalization functions * -- cgit v1.2.1