COMPMID-2700: Use NEON wrapper on SoftmaxLayer

Change-Id: Id8901e865c9f355dcf7b2a1a539493099591377e
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2186
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
author: Manuel Bottini <manuel.bottini@arm.com> 2019-10-29 17:20:09 +0000
committer: Manuel Bottini <manuel.bottini@arm.com> 2019-11-25 18:13:09 +0000
commit: 21079dd320c00068208acdfd59177895265a53f2 (patch)
tree: 76a9f889260146a40cb50023925941418c3b4704 /arm_compute/core/NEON/NEMath.inl
parent: 6d8b94ac6864dfd7ad38bc110006bdca5ee0f266 (diff)
download: ComputeLibrary-21079dd320c00068208acdfd59177895265a53f2.tar.gz
1 files changed, 33 insertions, 0 deletions
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index f1c9c2024b..a3601f6a25 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -317,6 +317,39 @@ inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
     return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
 }
 
+/** Widen a vector of sixteen 8-bit unsigned lanes into sixteen 32-bit float lanes.
+ *
+ * @param[in] in Vector of 16 uint8 values.
+ *
+ * @return Four float32x4_t vectors; val[k] holds input lanes 4*k .. 4*k+3, in order.
+ */
+inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
+{
+    // First widening step: each 8-lane half of the input becomes eight 16-bit lanes.
+    const uint16x8_t lanes_0_7  = vmovl_u8(vget_low_u8(in));
+    const uint16x8_t lanes_8_15 = vmovl_u8(vget_high_u8(in));
+
+    // Second widening step to 32 bits, followed by the unsigned-to-float conversion.
+    const float32x4x4_t result =
+    {
+        {
+            vcvtq_f32_u32(vmovl_u16(vget_low_u16(lanes_0_7))),
+            vcvtq_f32_u32(vmovl_u16(vget_high_u16(lanes_0_7))),
+            vcvtq_f32_u32(vmovl_u16(vget_low_u16(lanes_8_15))),
+            vcvtq_f32_u32(vmovl_u16(vget_high_u16(lanes_8_15)))
+        }
+    };
+    return result;
+}
+
+/** Narrow two three-channel float vectors into one three-channel uint8 vector.
+ *
+ * For every channel c, the four floats of in1.val[c] become output lanes 0..3 and
+ * the four floats of in2.val[c] become output lanes 4..7. Values are converted with
+ * vcvtq_u32_f32 and then narrowed 32 -> 16 -> 8 bits with the vqmovn_* intrinsics.
+ *
+ * @param[in]  in1 Float vectors supplying the low four lanes of each channel.
+ * @param[in]  in2 Float vectors supplying the high four lanes of each channel.
+ * @param[out] out Resulting uint8x8x3_t; all three channels are overwritten.
+ */
+inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
+{
+    for(int channel = 0; channel < 3; ++channel)
+    {
+        // float -> u32 -> u16 for each four-lane half of this channel.
+        const uint16x4_t low_u16  = vqmovn_u32(vcvtq_u32_f32(in1.val[channel]));
+        const uint16x4_t high_u16 = vqmovn_u32(vcvtq_u32_f32(in2.val[channel]));
+
+        // Join the halves and narrow once more down to eight 8-bit lanes.
+        out.val[channel] = vqmovn_u16(vcombine_u16(low_u16, high_u16));
+    }
+}
+
+/** Narrow sixteen float lanes into a single vector of sixteen 8-bit unsigned lanes.
+ *
+ * Each float vector is converted with vcvtq_u32_f32 and then narrowed
+ * 32 -> 16 -> 8 bits with the vqmovn_* intrinsics.
+ *
+ * @param[in]  in  Four float32x4_t vectors; in.val[k] supplies output lanes 4*k .. 4*k+3.
+ * @param[out] out Resulting vector of 16 uint8 values.
+ */
+inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out)
+{
+    // float -> u32 -> u16, one four-lane quarter at a time.
+    const uint16x4_t quarter0 = vqmovn_u32(vcvtq_u32_f32(in.val[0]));
+    const uint16x4_t quarter1 = vqmovn_u32(vcvtq_u32_f32(in.val[1]));
+    const uint16x4_t quarter2 = vqmovn_u32(vcvtq_u32_f32(in.val[2]));
+    const uint16x4_t quarter3 = vqmovn_u32(vcvtq_u32_f32(in.val[3]));
+
+    // Pair the quarters into halves and narrow each half down to eight 8-bit lanes.
+    const uint8x8_t lower_half = vqmovn_u16(vcombine_u16(quarter0, quarter1));
+    const uint8x8_t upper_half = vqmovn_u16(vcombine_u16(quarter2, quarter3));
+
+    out = vcombine_u8(lower_half, upper_half);
+}
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Exponent polynomial coefficients */
 /** Logarithm polynomial coefficients */
author	Manuel Bottini <manuel.bottini@arm.com>	2019-10-29 17:20:09 +0000
committer	Manuel Bottini <manuel.bottini@arm.com>	2019-11-25 18:13:09 +0000
commit	21079dd320c00068208acdfd59177895265a53f2 (patch)
tree	76a9f889260146a40cb50023925941418c3b4704 /arm_compute/core/NEON/NEMath.inl
parent	6d8b94ac6864dfd7ad38bc110006bdca5ee0f266 (diff)
download	ComputeLibrary-21079dd320c00068208acdfd59177895265a53f2.tar.gz