2 files changed, 70 insertions, 4 deletions
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index 523649c65d..ba65926802 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -42,17 +42,23 @@ float32x4_t vfloorq_f32(float32x4_t val);
  *
  * @return The calculated inverse square root.
  */
-float32x4_t vinvsqrtq_f32(float32x4_t x);
+float32x2_t vinvsqrt_f32(float32x2_t x);
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
 /** Calculate inverse square root.
  *
  * @param[in] x Input value.
  *
  * @return The calculated inverse square root.
  */
-float16x8_t vinvsqrtq_f16(float16x8_t x);
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+float32x4_t vinvsqrtq_f32(float32x4_t x);
+
+/** Calculate the reciprocal of each lane of a 2-lane F32 vector.
+ *
+ * @param[in] x Input vector; every lane is processed independently.
+ *
+ * @return Vector holding an approximation of 1/x for each lane of @p x.
+ */
+float32x2_t vinv_f32(float32x2_t x);
 
 /** Calculate reciprocal.
  *
@@ -122,6 +128,31 @@ float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
  * @return The calculated Hyperbolic Tangent.
  */
 float16x8_t vtanhq_f16(float16x8_t val);
+
+/** Calculate the reciprocal of each lane of a 4-lane F16 vector.
+ *
+ * @param[in] x Input vector; every lane is processed independently.
+ *
+ * @return Vector holding an approximation of 1/x for each lane of @p x.
+ */
+float16x4_t vinv_f16(float16x4_t x);
+
+/** Calculate the reciprocal of each lane of an 8-lane F16 vector.
+ *
+ * @param[in] x Input vector in F16 format.
+ *
+ * @return Vector holding the calculated reciprocal (1/x) of each lane of @p x.
+ */
+float16x8_t vinvq_f16(float16x8_t x);
+
+/** Calculate the inverse square root of each lane of a 4-lane F16 vector.
+ *
+ * @param[in] x Input vector; every lane is processed independently.
+ *
+ * @return Vector holding an approximation of 1/sqrt(x) for each lane of @p x.
+ */
+float16x4_t vinvsqrt_f16(float16x4_t x);
+
 /** Calculate inverse square root.
  *
  * @param[in] x Input value.
@@ -129,6 +160,7 @@ float16x8_t vtanhq_f16(float16x8_t val);
  * @return The calculated inverse square root.
  */
 float16x8_t vinvsqrtq_f16(float16x8_t x);
+
 /** Calculate exponential
  *
  * @param[in] x Input vector value in F16 format.
@@ -136,6 +168,7 @@ float16x8_t vinvsqrtq_f16(float16x8_t x);
  * @return The calculated exponent.
  */
 float16x8_t vexpq_f16(float16x8_t x);
+
 /** Calculate n power of a number.
  *
  * pow(x,n) = e^(n*log(x))
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index bdd747c4e9..50d85396d4 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -64,6 +64,15 @@ inline float32x4_t vfloorq_f32(float32x4_t val)
     return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r);
 }
 
+inline float32x2_t vinvsqrt_f32(float32x2_t x)
+{
+    // Start from the hardware estimate of 1/sqrt(x), then apply two vrsqrts-based refinement steps.
+    const float32x2_t estimate = vrsqrte_f32(x);
+    const float32x2_t refined  = vmul_f32(vrsqrts_f32(vmul_f32(x, estimate), estimate), estimate);
+
+    return vmul_f32(vrsqrts_f32(vmul_f32(x, refined), refined), refined);
+}
+
 inline float32x4_t vinvsqrtq_f32(float32x4_t x)
 {
     float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
@@ -73,6 +82,14 @@ inline float32x4_t vinvsqrtq_f32(float32x4_t x)
     return sqrt_reciprocal;
 }
 
+inline float32x2_t vinv_f32(float32x2_t x)
+{
+    // Start from the hardware estimate of 1/x, then apply two vrecps-based refinement steps.
+    const float32x2_t estimate = vrecpe_f32(x);
+    const float32x2_t refined  = vmul_f32(vrecps_f32(x, estimate), estimate);
+    return vmul_f32(vrecps_f32(x, refined), refined);
+}
+
 inline float32x4_t vinvq_f32(float32x4_t x)
 {
     float32x4_t recip = vrecpeq_f32(x);
@@ -182,6 +199,14 @@ const std::array<float16x8_t, 8> log_tab_f16 =
     }
 };
 
+inline float16x4_t vinvsqrt_f16(float16x4_t x)
+{
+    // Start from the hardware estimate of 1/sqrt(x), then apply two vrsqrts-based refinement steps.
+    const float16x4_t estimate = vrsqrte_f16(x);
+    const float16x4_t refined  = vmul_f16(vrsqrts_f16(vmul_f16(x, estimate), estimate), estimate);
+    return vmul_f16(vrsqrts_f16(vmul_f16(x, refined), refined), refined);
+}
+
 inline float16x8_t vinvsqrtq_f16(float16x8_t x)
 {
     float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
@@ -190,6 +215,14 @@ inline float16x8_t vinvsqrtq_f16(float16x8_t x)
     return sqrt_reciprocal;
 }
 
+inline float16x4_t vinv_f16(float16x4_t x)
+{
+    // Start from the hardware estimate of 1/x, then apply two vrecps-based refinement steps.
+    const float16x4_t estimate = vrecpe_f16(x);
+    const float16x4_t refined  = vmul_f16(vrecps_f16(x, estimate), estimate);
+    return vmul_f16(vrecps_f16(x, refined), refined);
+}
+
 inline float16x8_t vinvq_f16(float16x8_t x)
 {
     float16x8_t recip = vrecpeq_f16(x);