aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON/NEFixedPoint.inl
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-06-28 18:29:47 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-09-17 14:15:39 +0100
commit9247c92bd8c53be4d0c4ae931f51ca8f88e4150b (patch)
tree3d457a263c0aa6ddcf3d05a4a2323640c486aa36 /arm_compute/core/NEON/NEFixedPoint.inl
parent097967568f9363d06df3ac21403edcab57de39d7 (diff)
downloadComputeLibrary-9247c92bd8c53be4d0c4ae931f51ca8f88e4150b.tar.gz
COMPMID-428: Port NESoftmaxLayer to 16-bit fixed point.
Change-Id: I65122950bab9124b9758c27096c0f458b77aeabb Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79365 Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com> Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Steven Niu <steven.niu@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/NEFixedPoint.inl')
-rw-r--r--arm_compute/core/NEON/NEFixedPoint.inl64
1 files changed, 62 insertions, 2 deletions
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index b241dd5069..f62a338a61 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -384,6 +384,11 @@ inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
return vqadd_s16(a, b);
}
+inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
+{
+ return vqadd_s32(a, b);
+}
+
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
return vqaddq_s8(a, b);
@@ -394,6 +399,11 @@ inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
return vqaddq_s16(a, b);
}
+inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
+{
+ return vqaddq_s32(a, b);
+}
+
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
return vpaddl_s8(a);
@@ -1073,6 +1083,56 @@ inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
return vshl_s16(x, shift_value);
}
+inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
+{
+ // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+ const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
+ const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
+ const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
+
+ // Find shift value
+ const qint8x8_t shift_value = vqneg_s8(vsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+ const qint8x8_t temp = vqshl_s8(a, shift_value);
+
+ qint8x8_t x = vqadd_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));
+
+ uint8x8_t set_one = vcgt_s8(x, const_one);
+ x = vbsl_s8(set_one, const_one, x);
+
+ // Use three iterations of Newton-Raphson method to get the result
+ x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+ x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+ x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+ return vqshl_s8(x, shift_value);
+}
+
+inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
+{
+ // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
+ const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
+ const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
+ const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
+
+ // Find shift value
+ const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
+ const qint16x4_t temp = vqshl_s16(a, shift_value);
+
+ qint16x4_t x = vqadd_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
+
+ uint16x4_t set_one = vcgt_s16(x, const_one);
+ x = vbsl_s16(set_one, const_one, x);
+
+ // Use five iterations of Newton-Raphson method to get the result
+ x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+ x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+ x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+ x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+ x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+
+ return vqshl_s16(x, shift_value);
+}
+
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
// We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
@@ -1817,7 +1877,7 @@ inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
qint8x8_t num = vqsub_qs8(exp2x, const_one);
qint8x8_t den = vqadd_qs8(exp2x, const_one);
- qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
+ qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
return tanh;
}
@@ -1830,7 +1890,7 @@ inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
qint16x4_t num = vqsub_qs16(exp2x, const_one);
qint16x4_t den = vqadd_qs16(exp2x, const_one);
- qint16x4_t tanh = vqmul_qs16(num, vrecip_qs16(den, fixed_point_position), fixed_point_position);
+ qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
return tanh;
}