diff options
Diffstat (limited to 'arm_compute/core/NEON/NEFixedPoint.h')
-rw-r--r-- | arm_compute/core/NEON/NEFixedPoint.h | 531 |
1 files changed, 506 insertions, 25 deletions
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index 201c5b5e7e..660464eb62 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -46,6 +46,7 @@ using qint16x8_t = int16x8_t; /**< 16 bit fixed point vector with 8 elements using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */ using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */ using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */ +using qint32x4_t = int32x4_t; /**< 32 bit fixed point vector with 4 elements */ /** Get the lower half of a 16 elements vector * @@ -55,6 +56,14 @@ using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 element */ qint8x8_t vget_low_qs8(qint8x16_t a); +/** Get the lower half of a 16 elements vector + * + * @param[in] a vector of 8 elements + * + * @return 16 bit fixed point vector (4 elements) + */ +qint16x4_t vget_low_qs16(qint16x8_t a); + /** Get the higher half of a 16 elements vector * * @param[in] a vector of 16 elements @@ -63,21 +72,21 @@ qint8x8_t vget_low_qs8(qint8x16_t a); */ qint8x8_t vget_high_qs8(qint8x16_t a); -/** Load a single 8 bit fixed point vector from memory (8 elements) +/** Get the higher half of a 16 elements vector * - * @param[in] addr Memory address of the 8 bit fixed point vector to load + * @param[in] a vector of 8 elements * - * @return 8 bit fixed point vector (8 elements) + * @return 16 bit fixed point vector (4 elements) */ -qint8x8_t vld1_qs8(const qint8_t *addr); +qint16x4_t vget_high_qs16(qint16x8_t a); -/** Load a single 8 bit fixed point vector from memory (16 elements) +/** Load a single 8 bit fixed point vector from memory (8 elements) * * @param[in] addr Memory address of the 8 bit fixed point vector to load * - * @return 8 bit fixed point vector (16 elements) + * @return 8 bit fixed point vector (8 elements) */ -qint8x16_t vld1q_qs8(const qint8_t *addr); 
+qint8x8_t vld1_qs8(const qint8_t *addr); /** Load a single 16 bit fixed point vector from memory (4 elements) * @@ -87,6 +96,14 @@ qint8x16_t vld1q_qs8(const qint8_t *addr); */ qint16x4_t vld1_qs16(const qint16_t *addr); +/** Load a single 8 bit fixed point vector from memory (16 elements) + * + * @param[in] addr Memory address of the 8 bit fixed point vector to load + * + * @return 8 bit fixed point vector (16 elements) + */ +qint8x16_t vld1q_qs8(const qint8_t *addr); + /** Load a single 16 bit fixed point vector from memory (8 elements) * * @param[in] addr Memory address of the 16 bit fixed point vector to load @@ -103,6 +120,14 @@ qint16x8_t vld1q_qs16(const qint16_t *addr); */ qint8x8_t vld1_dup_qs8(const qint8_t *addr); +/** Load all lanes of 16 bit fixed point vector with same value from memory (4 elements) + * + * @param[in] addr Memory address of the 16 bit fixed point scalar value to load + * + * @return 16 bit fixed point vector (4 elements) + */ +qint16x4_t vld1_dup_qs16(const qint16_t *addr); + /** Load all lanes of 8 bit fixed point vector with same value from memory (16 elements) * * @param[in] addr Memory address of the 8 bit fixed point scalar value to load @@ -111,21 +136,21 @@ qint8x8_t vld1_dup_qs8(const qint8_t *addr); */ qint8x16_t vld1q_dup_qs8(const qint8_t *addr); -/** Store a single 8 bit fixed point vector to memory (8 elements) +/** Load all lanes of 16 bit fixed point vector with same value from memory (8 elements) * - * @param[in] addr Memory address where the 8 bit fixed point vector should be stored - * @param[in] b 8 bit fixed point vector to store + * @param[in] addr Memory address of the 16 bit fixed point scalar value to load * + * @return 16 bit fixed point vector (8 elements) */ -void vst1_qs8(qint8_t *addr, qint8x8_t b); +qint16x8_t vld1q_dup_qs16(const qint16_t *addr); -/** Store a single 8 bit fixed point vector to memory (16 elements) +/** Store a single 8 bit fixed point vector to memory (8 elements) * * @param[in] addr 
Memory address where the 8 bit fixed point vector should be stored * @param[in] b 8 bit fixed point vector to store * */ -void vst1q_qs8(qint8_t *addr, qint8x16_t b); +void vst1_qs8(qint8_t *addr, qint8x8_t b); /** Store a single 16 bit fixed point vector to memory (4 elements) * @@ -137,10 +162,18 @@ void vst1_qs16(qint16_t *addr, qint16x4_t b); /** Store a single 8 bit fixed point vector to memory (16 elements) * - * @param[in] addr Memory address where the 16 bit fixed point vector should be stored - * @param[in] b 16 bit fixed point vector to store + * @param[in] addr Memory address where the 8 bit fixed point vector should be stored + * @param[in] b 8 bit fixed point vector to store * */ +void vst1q_qs8(qint8_t *addr, qint8x16_t b); + +/** Store a single 16 bit fixed point vector to memory (8 elements) +* +* @param[in] addr Memory address where the 16 bit fixed point vector should be stored +* @param[in] b 16 bit fixed point vector to store +* +*/ void vst1q_qs16(qint16_t *addr, qint16x8_t b); /** 16 bit fixed point vector saturating narrow (8 elements) @@ -151,6 +184,14 @@ void vst1q_qs16(qint16_t *addr, qint16x8_t b); */ qint8x8_t vqmovn_q16(qint16x8_t a); +/** 32 bit fixed point vector saturating narrow (4 elements) + * + * @param[in] a 32 bit fixed point vector to convert + * + * @return 16 bit fixed point vector + */ +qint16x4_t vqmovn_q32(qint32x4_t a); + /** 8 bit fixed point vector duplicate (8 elements) * * @param[in] a 8 bit fixed point to duplicate @@ -159,6 +200,14 @@ qint8x8_t vqmovn_q16(qint16x8_t a); */ qint8x8_t vdup_n_qs8(qint8_t a); +/** 16 bit fixed point vector duplicate (4 elements) + * + * @param[in] a 16 bit fixed point to duplicate + * + * @return The result of the vector duplication + */ +qint16x4_t vdup_n_qs16(qint16_t a); + /** 8 bit fixed point vector duplicate (16 elements) * * @param[in] a 8 bit fixed point to duplicate @@ -192,6 +241,14 @@ qint16x8_t vdupq_n_qs16(qint16x8_t a); */ qint8x8_t vabs_qs8(qint8x8_t a); +/** Absolute 
value of 16 bit fixed point vector (4 elements) + * + * @param[in] a 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector absolute value + */ +qint16x4_t vabs_qs16(qint16x4_t a); + /** Absolute value of 8 bit fixed point vector (16 elements) * * @param[in] a 8 bit fixed point input vector @@ -200,6 +257,14 @@ qint8x8_t vabs_qs8(qint8x8_t a); */ qint8x16_t vabsq_qs8(qint8x16_t a); +/** Absolute value of 16 bit fixed point vector (8 elements) + * + * @param[in] a 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector absolute value + */ +qint16x8_t vabsq_qs16(qint16x8_t a); + /** Saturating absolute value of 8 bit fixed point vector (8 elements) * * @param[in] a 8 bit fixed point input vector @@ -208,6 +273,14 @@ qint8x16_t vabsq_qs8(qint8x16_t a); */ qint8x8_t vqabs_qs8(qint8x8_t a); +/** Saturating absolute value of 16 bit fixed point vector (4 elements) + * + * @param[in] a 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector absolute value + */ +qint16x4_t vqabs_qs16(qint16x4_t a); + /** Saturating absolute value of 8 bit fixed point vector (16 elements) * * @param[in] a 8 bit fixed point input vector @@ -216,6 +289,14 @@ qint8x8_t vqabs_qs8(qint8x8_t a); */ qint8x16_t vqabsq_qs8(qint8x16_t a); +/** Saturating absolute value of 16 bit fixed point vector (8 elements) + * + * @param[in] a 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector absolute value + */ +qint16x8_t vqabsq_qs16(qint16x8_t a); + /** 8 bit fixed point vector max (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -225,6 +306,15 @@ qint8x16_t vqabsq_qs8(qint8x16_t a); */ qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b); +/** 16 bit fixed point vector max (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point 
vector max operation + */ +qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b); + /** 8 bit fixed point vector max (16 elements) * * @param[in] a First 8 bit fixed point input vector @@ -234,6 +324,15 @@ qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b); */ qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b); +/** 16 bit fixed point vector max (8 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector max operation + */ +qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b); + /** 8 bit fixed point vector pairwise max (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -243,6 +342,15 @@ qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b); */ qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b); +/** 16 bit fixed point vector pairwise max (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector pairwise max operation + */ +qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b); + /** 8 bit fixed point vector min (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -252,6 +360,15 @@ qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b); */ qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b); +/** 16 bit fixed point vector min (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector min operation + */ +qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b); + /** 8 bit fixed point vector min (16 elements) * * @param[in] a First 8 bit fixed point input vector @@ -261,6 +378,15 @@ qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b); */ qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b); +/** 16 bit fixed point vector min (8 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b 
Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector min operation + */ +qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b); + /** 8 bit fixed point vector pairwise min (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -270,6 +396,15 @@ qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b); */ qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b); +/** 16 bit fixed point vector pairwise min (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector pairwise min operation + */ +qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b); + /** 8 bit fixed point vector add (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -279,6 +414,15 @@ qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b); */ qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b); +/** 16 bit fixed point vector add (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector addition + */ +qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b); + /** 8 bit fixed point vector add (16 elements) * * @param[in] a First 8 bit fixed point input vector @@ -288,23 +432,23 @@ qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b); */ qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b); -/** 8 bit fixed point vector saturating add (8 elements) +/** 16 bit fixed point vector add (8 elements) * - * @param[in] a First 8 bit fixed point input vector - * @param[in] b Second 8 bit fixed point input vector + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector * - * @return The result of the 8 bit fixed point vector addition. 
The result is saturated in case of overflow + * @return The result of the 16 bit fixed point vector addition */ -qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b); +qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b); -/** 8 bit fixed point vector saturating add (16 elements) +/** 8 bit fixed point vector saturating add (8 elements) * * @param[in] a First 8 bit fixed point input vector * @param[in] b Second 8 bit fixed point input vector * * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow */ -qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b); +qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b); /** 16 bit fixed point vector saturating add (4 elements) * @@ -315,6 +459,15 @@ qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b); */ qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b); +/** 8 bit fixed point vector saturating add (16 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * + * @return The result of the 8 bit fixed point vector addition. 
The result is saturated in case of overflow + */ +qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b); + /** 16 bit fixed point vector saturating add (8 elements) * * @param[in] a First 16 bit fixed point input vector @@ -341,6 +494,15 @@ int16x4_t vpaddl_qs8(qint8x8_t a); */ qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b); +/** 16 bit fixed point vector subtraction (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector subtraction + */ +qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b); + /** 8 bit fixed point vector subtraction (16 elements) * * @param[in] a First 8 bit fixed point input vector @@ -350,6 +512,15 @@ qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b); */ qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b); +/** 16 bit fixed point vector subtraction (8 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector subtraction + */ +qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b); + /** 8 bit fixed point vector saturating subtraction (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -359,6 +530,15 @@ qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b); */ qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b); +/** 16 bit fixed point vector saturating subtraction (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector subtraction. 
The result is saturated in case of overflow + */ +qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b); + /** 8 bit fixed point vector saturating subtraction (16 elements) * * @param[in] a First 8 bit fixed point input vector @@ -368,6 +548,15 @@ qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b); */ qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b); +/** 16 bit fixed point vector saturating subtraction (8 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow + */ +qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b); + /** 8 bit fixed point vector multiply (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -378,6 +567,16 @@ qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b); */ qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); +/** 16 bit fixed point vector multiply (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiplication. 
+ */ +qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); + /** 8 bit fixed point vector multiply (16 elements) * * @param[in] a First 8 bit fixed point input vector @@ -388,6 +587,16 @@ qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); */ qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); +/** 16 bit fixed point vector multiply (8 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiplication. + */ +qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); + /** 8 bit fixed point vector saturating multiply (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -398,6 +607,16 @@ qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); */ qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); +/** 16 bit fixed point vector saturating multiply (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiplication. 
The result is saturated in case of overflow + */ +qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); + /** 8 bit fixed point vector saturating multiply (16 elements) * * @param[in] a First 8 bit fixed point input vector @@ -408,6 +627,16 @@ qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); */ qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); +/** 16 bit fixed point vector saturating multiply (8 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow + */ +qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); + /** 8 bit fixed point vector long multiply (8 elements) * * @param[in] a First 8 bit fixed point input vector @@ -429,6 +658,17 @@ qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); */ qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); +/** 16 bit fixed point vector multiply-accumulate (4 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). + * + * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 16 bit fixed point input vector + * @param[in] c Third 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiply-accumulate + */ +qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position); + /** 8 bit fixed point vector multiply-accumulate (16 elements). 
This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). * * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to @@ -440,6 +680,17 @@ qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_positi */ qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position); +/** 16 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). + * + * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 16 bit fixed point input vector + * @param[in] c Third 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiply-accumulate + */ +qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position); + /** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). * * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to @@ -451,6 +702,17 @@ qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_p */ qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); +/** 16 bit fixed point vector saturating multiply-accumulate (4 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). 
+ * + * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 16 bit fixed point input vector + * @param[in] c Third 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow + */ +qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position); + /** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). * * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to @@ -462,6 +724,17 @@ qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_posit */ qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position); +/** 16 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c). + * + * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 16 bit fixed point input vector + * @param[in] c Third 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiply-accumulate.The result is saturated in case of overflow + */ +qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position); + /** 8 bit fixed point vector multiply-accumulate long (8 elements). * This operation performs the product between @p b and @p c and add the result to the 16 bit fixed point vector @p a (a + b * c). 
8 elements * @@ -474,6 +747,18 @@ qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_ */ qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); +/** 16 bit fixed point vector multiply-accumulate long (4 elements). + * This operation performs the product between @p b and @p c and add the result to the 32 bit fixed point vector @p a (a + b * c). 4 elements + * + * @param[in] a First 32 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 16 bit fixed point input vector + * @param[in] c Third 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiply-accumulate long + */ +qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position); + /** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector. * This operation performs the product between @p b and @p c and add the result to the 16 bit fixed point vector @p a (a + b * c). 8 elements * @@ -486,6 +771,18 @@ qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_pos */ qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); +/** 16 bit fixed point vector saturating multiply-accumulate long (4 elements). The saturation is performed on the 32 bit fixed point output vector. + * This operation performs the product between @p b and @p c and add the result to the 32 bit fixed point vector @p a (a + b * c). 
4 elements + * + * @param[in] a First 32 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 16 bit fixed point input vector + * @param[in] c Third 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point vector multiply-accumulate long + */ +qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position); + /** Convert a float vector with 4x2 elements to 8 bit fixed point vector with 8 elements * * @param[in] a Float input vector @@ -493,7 +790,16 @@ qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_po * * @return The result of the conversion float -> 8 bit fixed point */ -qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position); +qint8x8_t vcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position); + +/** Convert a float vector with 4 elements to 16 bit fixed point vector with 4 elements + * + * @param[in] a Float input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion float -> 16 bit fixed point + */ +qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position); /** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements * @@ -504,6 +810,15 @@ qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position); */ qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position); +/** Convert a float vector with 4x2 elements to 16 bit fixed point vector with 8 elements + * + * @param[in] a Float input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion float -> 16 bit fixed 
point + */ +qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position); + /** Convert a 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements * * @param[in] a 8 bit fixed point input vector @@ -513,6 +828,15 @@ qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position); */ float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position); +/** Convert a 16 bit fixed point vector with 4 elements to a float vector with 4 elements + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion 16 bit fixed point -> float32x4 + */ +float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position); + /** Convert a 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements * * @param[in] a 8 bit fixed point input vector @@ -522,6 +846,15 @@ float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position); */ float32x4x4_t vcvtq_qs8_f32(qint8x16_t a, int fixed_point_position); +/** Convert a 16 bit fixed point vector with 8 elements to a float vector with 4x2 elements + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion 16 bit fixed point -> float32x4x2 + */ +float32x4x2_t vcvtq_qs16_f32(qint16x8_t a, int fixed_point_position); + /** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements) * * @param[in] a 8bit fixed point input vector @@ -531,6 +864,15 @@ float32x4x4_t vcvtq_qs8_f32(qint8x16_t a, int fixed_point_position); */ qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position); +/** Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method. 
(4 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit reciprocal (1/a). + */ +qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position); + /** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements) * * @param[in] a 8bit fixed point input vector @@ -540,6 +882,15 @@ qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position); */ qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position); +/** Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method. (8 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit reciprocal (1/a). + */ +qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position); + /** Division fixed point 8bit (8 elements) * * @param[in] a First 8bit fixed point input vector @@ -550,6 +901,16 @@ qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position); */ qint8x8_t vdiv_qs8(qint8x8_t a, int8x8_t b, int fixed_point_position); +/** Division fixed point 16 bit (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The quotient and remainder number in fixed point format. 
+ */ +qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position); + /** Division fixed point 8bit (16 elements) * * @param[in] a First 8bit fixed point input vector @@ -558,7 +919,17 @@ qint8x8_t vdiv_qs8(qint8x8_t a, int8x8_t b, int fixed_point_position); * * @return The quotient and remainder number in 8bit fixed point format. */ -qint8x16_t vdivq_qs8(qint8x16_t a, int8x16_t b, int fixed_point_position); +qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); + +/** Division fixed point 16 bit (8 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The quotient and remainder number in 16 bit fixed point format. + */ +qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position); /** Perform a 4th degree polynomial approximation. (8 elements) * @@ -570,6 +941,16 @@ qint8x16_t vdivq_qs8(qint8x16_t a, int8x16_t b, int fixed_point_position); template <bool islog> qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position); +/** Perform a 4th degree polynomial approximation. (4 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit taylor approximation. + */ +template <bool islog> +qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position); + /** Perform a 4th degree polynomial approximation. (16 elements) * * @param[in] a 8bit fixed point input vector @@ -580,6 +961,16 @@ qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position); template <bool islog> qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position); +/** Perform a 4th degree polynomial approximation. 
(8 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit taylor approximation. + */ +template <bool islog> +qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position); + /** Calculate saturating exponential fixed point 8bit (8 elements) * * @param[in] a 8bit fixed point input vector @@ -589,6 +980,15 @@ qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position); */ qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position); +/** Calculate saturating exponential fixed point 16 bit (4 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit saturating exponential + */ +qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position); + /** Calculate saturating exponential fixed point 8bit (16 elements) * * @param[in] a 8bit fixed point input vector @@ -598,7 +998,16 @@ qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position); */ qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position); -/** Calculate logarithm fixed point 16bit (8 elements) +/** Calculate saturating exponential fixed point 16 bit (8 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit saturating exponential + */ +qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position); + +/** Calculate logarithm fixed point 8 bit (8 elements) * * @param[in] a 8bit fixed point input vector * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number @@ -607,6 +1016,15 @@ qint8x16_t 
vqexpq_qs8(qint8x16_t a, int fixed_point_position); */ qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position); +/** Calculate logarithm fixed point 16 bit (4 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit logarithm. + */ +qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position); + /** Calculate logarithm fixed point 16bit (16 elements) * * @param[in] a 8bit fixed point input vector @@ -616,6 +1034,15 @@ qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position); */ qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position); +/** Calculate logarithm fixed point 16 bit (8 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit logarithm. + */ +qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position); + /** Calculate inverse square root for fixed point 8bit using Newton-Raphosn method (8 elements) * * @param[in] a 8bit fixed point input vector @@ -625,6 +1052,15 @@ qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position); */ qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position); +/** Calculate inverse square root for fixed point 16 bit using Newton-Raphson method (4 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit inverse sqrt. 
+ */ +qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position); + /** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphosn method (8 elements) * * @param[in] a 8bit fixed point input vector @@ -634,6 +1070,15 @@ qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position); */ qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position); +/** Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphson method (4 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit inverse sqrt. + */ +qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position); + /** Calculate inverse square root for fixed point 8bit using Newton-Raphosn method (16 elements) * * @param[in] a 8bit fixed point input vector @@ -643,6 +1088,15 @@ qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position); */ qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position); +/** Calculate inverse square root for fixed point 16 bit using Newton-Raphson method (8 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit inverse sqrt. 
+ */ +qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position); + /** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphosn method (16 elements) * * @param[in] a 8bit fixed point input vector @@ -652,6 +1106,15 @@ qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position); */ qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position); +/** Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphson method (8 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit inverse sqrt. + */ +qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position); + /** Calculate hyperbolic tangent for fixed point 8bit (8 elements) * * @param[in] a 8bit fixed point input vector @@ -661,6 +1124,15 @@ qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position); */ qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position); +/** Calculate hyperbolic tangent for fixed point 16 bit (4 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The calculated Hyperbolic Tangent. 
+ */ +qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position); + /** Calculate hyperbolic tangent for fixed point 8bit (16 elements) * * @param[in] a 8bit fixed point input vector @@ -690,6 +1162,15 @@ qint8x8_t vqpowq_qs8(qint8x8_t a, qint8x16_t b, int fixed_point_position); * @return The lane-by-lane maximum -> float32x4x2 */ float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b); + +/** Calculate hyperbolic tangent for fixed point 16 bit (8 elements) + * + * @param[in] a 16 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The calculated Hyperbolic Tangent. + */ +qint16x8_t vtanhq_qs16(qint16x8_t a, int fixed_point_position); } #include "arm_compute/core/NEON/NEFixedPoint.inl" #endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */ |