aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON/wrapper
diff options
context:
space:
mode:
author: Sang-Hoon Park <sang-hoon.park@arm.com> 2020-07-16 14:26:16 +0100
committer: Sang-Hoon Park <sang-hoon.park@arm.com> 2020-07-28 08:17:55 +0000
commit: 3351f2a454a11e15934fa8bfac635785783cf8e1 (patch)
tree: 991c4f863af9bca765f25e3c1a91bb7fc1b2a75b /arm_compute/core/NEON/wrapper
parent: ad7515d231acb075a9585e52f257373b1a1b5d1f (diff)
download: ComputeLibrary-3351f2a454a11e15934fa8bfac635785783cf8e1.tar.gz
COMPMID-3575: Mixed precision in NEInstanceNormalizationLayerKernel
In order to fix the issue caused by the limited precision of FP16, mixed precision (float accumulator) is introduced to NEInstanceNormalizationLayerKernel. Since the reference kernel uses mixed precision, mixed precision computation is currently the default when the kernel is called from NEInstanceNormalizationLayer. - Make NEInstanceNormalizationLayerKernel use kernel descriptor to enable mixed precision computation - NEInstanceNormalizationLayer is modified to use the descriptor Change-Id: I7766622d715df054e303f9b441380b15b51f02b2 Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3589 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/wrapper')
-rw-r--r-- arm_compute/core/NEON/wrapper/intrinsics/cvt.h 16
1 files changed, 16 insertions, 0 deletions
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
index de1261bdd0..6e79a92bc2 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
@@ -40,8 +40,24 @@ namespace wrapper
VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
VCVT_TO_F32_IMPL(float32x4_t, int32x4_t, vcvtq, f32, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VCVT_TO_F32_IMPL
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \
+ vcvt(const vtype &a) \
+ { \
+ return prefix##_##postfix1##_##postfix2(a); \
+ }
+
+VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
+#undef VCVT_TO_F16_IMPL
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type
vcvt(const float32x4_t &a)