aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON/NEMath.inl
diff options
context:
space:
mode:
author: Sang-Hoon Park <sang-hoon.park@arm.com> 2019-11-22 16:05:46 +0000
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2019-12-20 11:08:34 +0000
commit: c3a74200a66ea5fb718b8406bed2043bc097930e (patch)
tree: 1c0f56d82d5ac29219c2bbad119df211f82dda80 /arm_compute/core/NEON/NEMath.inl
parent: d817647a4fabc8eccd0e64f54465e378a4239b32 (diff)
download: ComputeLibrary-c3a74200a66ea5fb718b8406bed2043bc097930e.tar.gz
COMPMID-2775 [NE] add support for QASYMM8_SIGNED to SoftmaxLayer
Change-Id: Ic46d4143929c8c9b548355d85c78542faf25d612
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2376
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/NEMath.inl')
-rw-r--r-- arm_compute/core/NEON/NEMath.inl 25
1 files changed, 24 insertions, 1 deletions
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 179f1b6299..5d8b82c281 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -331,6 +331,20 @@ inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
return out;
}
+inline float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in) // Converts the 16 signed 8-bit lanes of 'in' to float32, returned as four 4-lane vectors.
+{
+ float32x4x4_t out; // out.val[k] receives input lanes 4k .. 4k+3
+
+ const auto tmp1 = vmovl_s8(vget_low_s8(in)); // input lanes 0-7, widened s8 -> s16
+ out.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp1))); // lanes 0-3: widen s16 -> s32, then convert s32 -> f32
+ out.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp1))); // lanes 4-7, same widen-then-convert chain
+
+ const auto tmp2 = vmovl_s8(vget_high_s8(in)); // input lanes 8-15, widened s8 -> s16
+ out.val[2] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp2))); // lanes 8-11
+ out.val[3] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp2))); // lanes 12-15
+ return out;
+}
+
inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
{
out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
@@ -341,7 +355,7 @@ inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const flo
vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
}
-inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out)
+inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
{
const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
vqmovn_u32(vcvtq_u32_f32(in.val[1])));
@@ -350,6 +364,15 @@ inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t
out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
}
+inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) // Converts the 16 float32 lanes of 'in' to signed 8-bit and writes them to 'out' in lane order.
+{
+ const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), // in.val[0]: f32 -> s32 (vcvtq_s32_f32 rounds toward zero), then saturating narrow to s16
+ vqmovn_s32(vcvtq_s32_f32(in.val[1]))); // in.val[1] handled the same way; 'low' holds output lanes 0-7 as s16
+ const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), // in.val[2] -> output lanes 8-11 as s16
+ vqmovn_s32(vcvtq_s32_f32(in.val[3]))); // in.val[3] -> output lanes 12-15 as s16
+ out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); // second saturating narrow (s16 -> s8), then join both halves into the 16-lane result
+}
+
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Exponent polynomial coefficients */
/** Logarithm polynomial coefficients */