From bcf8a968da4b26926df8bb770df16d82146bcb54 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Fri, 12 Oct 2018 10:51:31 +0100
Subject: COMPMID-1580 Implement ReduceMean in NEON

Change-Id: Id974efad304c2513b8824a6561ad45ee60b9e7fb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/153763
Reviewed-by: Giuseppe Rossini
Reviewed-by: Isabella Gottardi
Tested-by: bsgcomp
---
 arm_compute/core/NEON/wrapper/intrinsics/gethigh.h |  53 ++++++
 arm_compute/core/NEON/wrapper/intrinsics/getlane.h | 204 +++++++++++++++++++++
 arm_compute/core/NEON/wrapper/intrinsics/getlow.h  |  53 ++++++
 .../core/NEON/wrapper/intrinsics/intrinsics.h      |   6 +
 arm_compute/core/NEON/wrapper/intrinsics/load.h    |   6 +
 arm_compute/core/NEON/wrapper/intrinsics/movl.h    |  49 +++++
 arm_compute/core/NEON/wrapper/intrinsics/movn.h    |  62 +++++++
 arm_compute/core/NEON/wrapper/intrinsics/mul.h     |   6 +
 arm_compute/core/NEON/wrapper/intrinsics/padd.h    |  53 ++++++
 arm_compute/core/NEON/wrapper/intrinsics/store.h   |   6 +
 10 files changed, 498 insertions(+)
 create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
 create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/getlane.h
 create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/getlow.h
 create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/movl.h
 create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/movn.h
 create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/padd.h

(limited to 'arm_compute/core/NEON/wrapper/intrinsics')

diff --git a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
new file mode 100644
index 0000000000..47b0116b84
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_GET_HIGH_H__ +#define __ARM_COMPUTE_WRAPPER_GET_HIGH_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VGETHIGH_IMPL(half_vtype, vtype, postfix) \ + inline half_vtype vgethigh(const vtype val) \ + { \ + return vget_high_##postfix(val); \ + } + +VGETHIGH_IMPL(uint8x8_t, uint8x16_t, u8) +VGETHIGH_IMPL(int8x8_t, int8x16_t, s8) +VGETHIGH_IMPL(uint16x4_t, uint16x8_t, u16) +VGETHIGH_IMPL(int16x4_t, int16x8_t, s16) +VGETHIGH_IMPL(uint32x2_t, uint32x4_t, u32) +VGETHIGH_IMPL(int32x2_t, int32x4_t, s32) +VGETHIGH_IMPL(float32x2_t, float32x4_t, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VGETHIGH_IMPL(float16x4_t, float16x8_t, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VGETHIGH_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_GET_HIGH_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h new file mode 100644 index 0000000000..107ce44e0c --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_GET_LANE_H__ +#define __ARM_COMPUTE_WRAPPER_GET_LANE_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VGETLANE_IMPL_8(stype, vtype, postfix) \ + inline stype vgetlane(const vtype vector, const int lane) \ + { \ + switch(lane) \ + { \ + case 0: \ + return vget_lane_##postfix(vector, 0); \ + case 1: \ + return vget_lane_##postfix(vector, 1); \ + case 2: \ + return vget_lane_##postfix(vector, 2); \ + case 3: \ + return vget_lane_##postfix(vector, 3); \ + case 4: \ + return vget_lane_##postfix(vector, 4); \ + case 5: \ + return vget_lane_##postfix(vector, 5); \ + case 6: \ + return vget_lane_##postfix(vector, 6); \ + case 7: \ + return vget_lane_##postfix(vector, 7); \ + default: \ + ARM_COMPUTE_ERROR("Invalid lane"); \ + } \ + } + +#define VGETLANE_IMPL_4(stype, vtype, postfix) \ + inline stype vgetlane(const vtype vector, const int lane) \ + { \ + switch(lane) \ + { \ + case 0: \ + return vget_lane_##postfix(vector, 0); \ + case 1: \ + return vget_lane_##postfix(vector, 1); \ + case 2: \ + return vget_lane_##postfix(vector, 2); \ + case 3: \ + return vget_lane_##postfix(vector, 3); \ + default: \ + ARM_COMPUTE_ERROR("Invalid lane"); \ + } \ + } + +#define VGETLANE_IMPL_2(stype, vtype, postfix) \ + inline stype vgetlane(const vtype vector, const int lane) \ + { \ + switch(lane) \ + { \ + case 0: \ + return vget_lane_##postfix(vector, 0); \ + case 1: \ + return vget_lane_##postfix(vector, 1); \ + default: \ + ARM_COMPUTE_ERROR("Invalid lane"); \ + } \ + } + +VGETLANE_IMPL_8(uint8_t, uint8x8_t, u8) +VGETLANE_IMPL_8(int8_t, int8x8_t, s8) +VGETLANE_IMPL_4(uint16_t, uint16x4_t, u16) +VGETLANE_IMPL_4(int16_t, int16x4_t, s16) +VGETLANE_IMPL_2(uint32_t, uint32x2_t, u32) +VGETLANE_IMPL_2(int32_t, int32x2_t, s32) +VGETLANE_IMPL_2(float, float32x2_t, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VGETLANE_IMPL_4(float16_t, float16x4_t, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#define VGETQLANE_IMPL_16(stype, vtype, postfix) \ + inline stype vgetqlane(const vtype vector, const int lane) \ + { \ + switch(lane) \ + { \ + case 0: \ + return vgetq_lane_##postfix(vector, 0); \ + case 1: \ + return vgetq_lane_##postfix(vector, 1); \ + case 2: \ + return vgetq_lane_##postfix(vector, 2); \ + case 3: \ + return vgetq_lane_##postfix(vector, 3); \ + case 4: \ + return vgetq_lane_##postfix(vector, 4); \ + case 5: \ + return vgetq_lane_##postfix(vector, 5); \ + case 6: \ + return vgetq_lane_##postfix(vector, 6); \ + case 7: \ + return vgetq_lane_##postfix(vector, 7); \ + case 8: \ + return vgetq_lane_##postfix(vector, 8); \ + case 9: \ + return vgetq_lane_##postfix(vector, 9); \ + case 10: \ + return vgetq_lane_##postfix(vector, 10); \ + case 11: \ + return vgetq_lane_##postfix(vector, 11); \ + case 12: \ + return vgetq_lane_##postfix(vector, 12); \ + case 13: \ + return vgetq_lane_##postfix(vector, 13); \ + case 14: \ + return vgetq_lane_##postfix(vector, 14); \ + case 15: \ + return vgetq_lane_##postfix(vector, 15); \ + default: \ + ARM_COMPUTE_ERROR("Invalid lane"); \ + } \ + } + +#define VGETQLANE_IMPL_8(stype, vtype, postfix) \ + inline stype vgetqlane(const vtype vector, const int lane) \ + { \ + switch(lane) \ + { \ + case 0: \ + return vgetq_lane_##postfix(vector, 0); \ + case 1: \ + return vgetq_lane_##postfix(vector, 1); \ + case 2: \ + return vgetq_lane_##postfix(vector, 2); \ + case 3: \ + return vgetq_lane_##postfix(vector, 3); \ + case 4: \ + return vgetq_lane_##postfix(vector, 4); \ + case 5: \ + return 
vgetq_lane_##postfix(vector, 5); \ + case 6: \ + return vgetq_lane_##postfix(vector, 6); \ + case 7: \ + return vgetq_lane_##postfix(vector, 7); \ + default: \ + ARM_COMPUTE_ERROR("Invalid lane"); \ + } \ + } + +#define VGETQLANE_IMPL_4(stype, vtype, postfix) \ + inline stype vgetqlane(const vtype vector, const int lane) \ + { \ + switch(lane) \ + { \ + case 0: \ + return vgetq_lane_##postfix(vector, 0); \ + case 1: \ + return vgetq_lane_##postfix(vector, 1); \ + case 2: \ + return vgetq_lane_##postfix(vector, 2); \ + case 3: \ + return vgetq_lane_##postfix(vector, 3); \ + default: \ + ARM_COMPUTE_ERROR("Invalid lane"); \ + } \ + } + +VGETQLANE_IMPL_16(uint8_t, uint8x16_t, u8) +VGETQLANE_IMPL_16(int8_t, int8x16_t, s8) +VGETQLANE_IMPL_8(uint16_t, uint16x8_t, u16) +VGETQLANE_IMPL_8(int16_t, int16x8_t, s16) +VGETQLANE_IMPL_4(uint32_t, uint32x4_t, u32) +VGETQLANE_IMPL_4(int32_t, int32x4_t, s32) +VGETQLANE_IMPL_4(float, float32x4_t, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VGETQLANE_IMPL_8(float16_t, float16x8_t, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VGETLANE_IMPL_8 +#undef VGETLANE_IMPL_4 +#undef VGETLANE_IMPL_2 +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_GET_LANE_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h new file mode 100644 index 0000000000..cc5d8bb2f2 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_GET_LOW_H__ +#define __ARM_COMPUTE_WRAPPER_GET_LOW_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VGETLOW_IMPL(half_vtype, vtype, postfix) \ + inline half_vtype vgetlow(const vtype val) \ + { \ + return vget_low_##postfix(val); \ + } + +VGETLOW_IMPL(uint8x8_t, uint8x16_t, u8) +VGETLOW_IMPL(int8x8_t, int8x16_t, s8) +VGETLOW_IMPL(uint16x4_t, uint16x8_t, u16) +VGETLOW_IMPL(int16x4_t, int16x8_t, s16) +VGETLOW_IMPL(uint32x2_t, uint32x4_t, u32) +VGETLOW_IMPL(int32x2_t, int32x4_t, s32) +VGETLOW_IMPL(float32x2_t, float32x4_t, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VGETLOW_IMPL(float16x4_t, float16x8_t, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VGETLOW_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_GET_LOW_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h index 58bfba9645..2e6fd75005 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h @@ -28,13 +28,19 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/and.h" #include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h" #include "arm_compute/core/NEON/wrapper/intrinsics/exp.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h" #include "arm_compute/core/NEON/wrapper/intrinsics/inv.h" #include "arm_compute/core/NEON/wrapper/intrinsics/load.h" #include "arm_compute/core/NEON/wrapper/intrinsics/max.h" #include "arm_compute/core/NEON/wrapper/intrinsics/min.h" #include "arm_compute/core/NEON/wrapper/intrinsics/mla.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/movl.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/movn.h" #include "arm_compute/core/NEON/wrapper/intrinsics/mul.h" #include "arm_compute/core/NEON/wrapper/intrinsics/neg.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/padd.h" #include "arm_compute/core/NEON/wrapper/intrinsics/store.h" #endif /* __ARM_COMPUTE_WRAPPER_INTRINSICS_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/load.h b/arm_compute/core/NEON/wrapper/intrinsics/load.h index 442d857497..b5d9ed2a35 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/load.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/load.h @@ -45,6 +45,9 @@ VLOAD_IMPL(int32_t, int32x2_t, s32) //VLOAD_IMPL(uint64_t, uint64x1_t, u64) //VLOAD_IMPL(int64_t, int64x1_t, s64) VLOAD_IMPL(float, float32x2_t, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VLOAD_IMPL(float16_t, float16x4_t, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #define VLOADQ_IMPL(stype, vtype, postfix) \ inline vtype vloadq(const stype *ptr) \ @@ -61,6 +64,9 @@ VLOADQ_IMPL(int32_t, int32x4_t, s32) //VLOAD_IMPL(uint64_t, uint64x1_t, u64) //VLOAD_IMPL(int64_t, int64x1_t, s64) VLOADQ_IMPL(float, float32x4_t, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VLOADQ_IMPL(float16_t, float16x8_t, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VLOAD_IMPL } // namespace wrapper diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movl.h b/arm_compute/core/NEON/wrapper/intrinsics/movl.h new file mode 100644 index 0000000000..728fe4e097 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/movl.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_MOVL_H__ +#define __ARM_COMPUTE_WRAPPER_MOVL_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VMOVL_IMPL(ptype, vtype, prefix, postfix) \ + inline ptype vmovl(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VMOVL_IMPL(uint16x8_t, uint8x8_t, vmovl, u8) +VMOVL_IMPL(int16x8_t, int8x8_t, vmovl, s8) +VMOVL_IMPL(uint32x4_t, uint16x4_t, vmovl, u16) +VMOVL_IMPL(int32x4_t, int16x4_t, vmovl, s16) +VMOVL_IMPL(uint64x2_t, uint32x2_t, vmovl, u32) +VMOVL_IMPL(int64x2_t, int32x2_t, vmovl, s32) + +#undef VMOVL_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_MOVL_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movn.h b/arm_compute/core/NEON/wrapper/intrinsics/movn.h new file mode 100644 index 0000000000..6ed262edb6 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/movn.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_MOVN_H__ +#define __ARM_COMPUTE_WRAPPER_MOVN_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VMOVN_IMPL(dtype, vtype, prefix, postfix) \ + inline dtype vmovn(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VMOVN_IMPL(uint32x2_t, uint64x2_t, vmovn, u64) +VMOVN_IMPL(int32x2_t, int64x2_t, vmovn, s64) +VMOVN_IMPL(uint16x4_t, uint32x4_t, vmovn, u32) +VMOVN_IMPL(int16x4_t, int32x4_t, vmovn, s32) +VMOVN_IMPL(uint8x8_t, uint16x8_t, vmovn, u16) +VMOVN_IMPL(int8x8_t, int16x8_t, vmovn, s16) + +#define VQMOVN_IMPL(dtype, vtype, prefix, postfix) \ + inline dtype vqmovn(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VQMOVN_IMPL(uint32x2_t, uint64x2_t, vqmovn, u64) +VQMOVN_IMPL(int32x2_t, int64x2_t, vqmovn, s64) +VQMOVN_IMPL(uint16x4_t, uint32x4_t, vqmovn, u32) +VQMOVN_IMPL(int16x4_t, int32x4_t, vqmovn, s32) +VQMOVN_IMPL(uint8x8_t, uint16x8_t, vqmovn, u16) +VQMOVN_IMPL(int8x8_t, int16x8_t, vqmovn, s16) + +#undef VMOVN_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_MOVN_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mul.h b/arm_compute/core/NEON/wrapper/intrinsics/mul.h index c1908fc7b3..932b746965 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/mul.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/mul.h @@ -43,6 +43,9 @@ VMUL_IMPL(int16x4_t, int16x4_t, vmul, s16) VMUL_IMPL(uint32x2_t, uint32x2_t, vmul, u32) VMUL_IMPL(int32x2_t, int32x2_t, vmul, s32) VMUL_IMPL(float32x2_t, float32x2_t, vmul, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMUL_IMPL(float16_t, float16x4_t, vmul, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC VMUL_IMPL(uint8_t, uint8x16_t, vmulq, u8) VMUL_IMPL(int8_t, int8x16_t, vmulq, s8) @@ -51,6 +54,9 @@ VMUL_IMPL(int16_t, int16x8_t, vmulq, s16) VMUL_IMPL(uint32_t, uint32x4_t, vmulq, u32) VMUL_IMPL(int32_t, int32x4_t, vmulq, s32) VMUL_IMPL(float32x4_t, float32x4_t, vmulq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMUL_IMPL(float16_t, float16x8_t, vmulq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VMUL_IMPL } // namespace wrapper diff --git a/arm_compute/core/NEON/wrapper/intrinsics/padd.h b/arm_compute/core/NEON/wrapper/intrinsics/padd.h new file mode 100644 index 0000000000..5ee2173df8 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/padd.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_PADD_H__ +#define __ARM_COMPUTE_WRAPPER_PADD_H__ + +#include + +namespace arm_compute +{ +namespace wrapper +{ +#define VPADD_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vpadd(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VPADD_IMPL(uint8x8_t, uint8x8_t, vpadd, u8) +VPADD_IMPL(int8x8_t, int8x8_t, vpadd, s8) +VPADD_IMPL(uint16x4_t, uint16x4_t, vpadd, u16) +VPADD_IMPL(int16x4_t, int16x4_t, vpadd, s16) +VPADD_IMPL(uint32x2_t, uint32x2_t, vpadd, u32) +VPADD_IMPL(int32x2_t, int32x2_t, vpadd, s32) +VPADD_IMPL(float32x2_t, float32x2_t, vpadd, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VPADD_IMPL(float16x4_t, float16x4_t, vpadd, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VPADD_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_PADD_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/store.h b/arm_compute/core/NEON/wrapper/intrinsics/store.h index be89602c09..35c427902e 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/store.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/store.h @@ -45,6 +45,9 @@ VSTORE_IMPL(int32_t, int32x2_t, vst1, s32) //VSTORE_IMPL(uint64_t, 1, vst1, u64) //VSTORE_IMPL(int64_t, 1, vst1, s64) VSTORE_IMPL(float, float32x2_t, vst1, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VSTORE_IMPL(float16_t, float16x4_t, vst1, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC VSTORE_IMPL(uint8_t, uint8x16_t, vst1q, u8) VSTORE_IMPL(int8_t, int8x16_t, vst1q, s8) @@ -55,6 +58,9 @@ VSTORE_IMPL(int32_t, int32x4_t, vst1q, s32) //VSTORE_IMPL(uint64_t, 2, vst1q, u64) //VSTORE_IMPL(int64_t, 2, vst1q, s64) VSTORE_IMPL(float, float32x4_t, vst1q, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VSTORE_IMPL } // namespace wrapper -- cgit v1.2.1
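
Usage sketch (illustrative only, not part of the patch): the hunks above only add the type-generic wrappers; the NEReduceMean kernel that consumes them lives outside the path shown here (the diff is limited to 'arm_compute/core/NEON/wrapper/intrinsics'). The snippet below is a minimal sketch of how the new wrappers compose to reduce eight uint8_t values to a mean: widen with vmovl, split with vgetlow/vgethigh, accumulate pairwise with vpadd, and extract the scalar with vgetlane. The function name is hypothetical, and it assumes the pre-existing uint8_t overload of vload in load.h (this patch only adds the FP16 variants there).

#include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
#include <cstdint>

// Illustrative sketch only; not part of this change.
inline uint8_t reduce_mean_u8x8(const uint8_t *ptr)
{
    using namespace arm_compute::wrapper;

    const uint8x8_t  in  = vload(ptr);                          // 8 x u8 (existing load.h wrapper, assumed)
    const uint16x8_t w16 = vmovl(in);                           // widen to 8 x u16
    const uint16x4_t p16 = vpadd(vgetlow(w16), vgethigh(w16));  // 4 pairwise sums, still fit in u16
    const uint32x4_t w32 = vmovl(p16);                          // widen to 4 x u32
    uint32x2_t       p32 = vpadd(vgetlow(w32), vgethigh(w32));  // 2 partial sums
    p32                  = vpadd(p32, p32);                     // total sum in both lanes
    const uint32_t   sum = vgetlane(p32, 0);                    // extract scalar
    return static_cast<uint8_t>((sum + 4) / 8);                 // rounded mean of the 8 values
}

vmovn and vqmovn go the opposite way: they narrow a wide accumulator back to the smaller element type (with saturation in the vqmovn case), which is the usual last step before writing a quantised result back out with vstore.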