From 20cfa45faefbf56f62c8b1aa95dfd0b4f52e5641 Mon Sep 17 00:00:00 2001
From: Pablo Marquez Tello
Date: Mon, 20 Mar 2023 16:29:21 +0000
Subject: Round to nearest with ties to away from zero in Relu

* This patch adds support for rounding modes in vmlaq_qasymm8_signed
  which is used to compute Relu for quantized types
* Partially resolves MLCE-1018

Change-Id: I2a267b84745430e1ffe92b8bc79828a39332db18
Signed-off-by: Pablo Marquez Tello
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9354
Comments-Addressed: Arm Jenkins
Reviewed-by: Gunes Bayir
Reviewed-by: Viet-Hoa Do
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
---
 src/BUILD.bazel                                    |  3 ++
 src/CMakeLists.txt                                 |  3 ++
 src/core/NEON/NEAsymm.h                            |  8 ++-
 src/core/NEON/NEAsymm.inl                          | 60 +++++++++++++++++++++-
 .../kernels/activation/generic/neon/qasymm8.cpp    | 14 ++---
 .../activation/generic/neon/qasymm8_signed.cpp     | 14 ++---
 6 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index c88581686a..e2b7609f03 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -242,6 +242,9 @@ filegroup(
         "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp",
         "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp",
         "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp",
+        "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp",
+        "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp",
+        "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp",
         "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp",
         "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp",
         "core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b056421352..ca07ebf029 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -221,6 +221,9 @@ target_sources(
     core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
     core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
     core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
+    core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_1VLx4VL/generic.cpp
+    core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_2VLx2VL/generic.cpp
+    core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp16fp32fp16_mopa_4VLx1VL/generic.cpp
     core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
     core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
     core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index 9b92a865d0..5b8d2be04b 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -715,6 +715,12 @@ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQua
     const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
     return { pa, pb };
 }
+
+template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
+qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
+
+template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
+qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
 } // namespace arm_compute
 #include "src/core/NEON/NEAsymm.inl"
 #endif // ARM_COMPUTE_NEASYMM_H
diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl
index 6ee1a336b8..ca2aea1e18 100644
--- a/src/core/NEON/NEAsymm.inl
+++ b/src/core/NEON/NEAsymm.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
+#include "arm_compute/core/Rounding.h"
+
 namespace arm_compute
 {
+template <RoundingPolicy round_policy>
 inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
 {
     // Convert uint8 vectors to uint16 vectors
@@ -46,16 +50,43 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v
     C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
     D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
     // Convert float32 vectors to uint32 vectors
+#if __aarch64__
+    if(round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+    {
+        A_u32x4 = vcvtnq_u32_f32(A_f32x4);
+        B_u32x4 = vcvtnq_u32_f32(B_f32x4);
+        C_u32x4 = vcvtnq_u32_f32(C_f32x4);
+        D_u32x4 = vcvtnq_u32_f32(D_f32x4);
+    }
+    else if(round_policy == RoundingPolicy::TO_NEAREST_UP)
+    {
+        A_u32x4 = vcvtaq_u32_f32(A_f32x4);
+        B_u32x4 = vcvtaq_u32_f32(B_f32x4);
+        C_u32x4 = vcvtaq_u32_f32(C_f32x4);
+        D_u32x4 = vcvtaq_u32_f32(D_f32x4);
+    }
+    else
+    {
+        A_u32x4 = vcvtq_u32_f32(A_f32x4);
+        B_u32x4 = vcvtq_u32_f32(B_f32x4);
+        C_u32x4 = vcvtq_u32_f32(C_f32x4);
+        D_u32x4 = vcvtq_u32_f32(D_f32x4);
+    }
+#else  // #if __aarch64__
+    // rounding mode only supported in aarch64
     A_u32x4 = vcvtq_u32_f32(A_f32x4);
     B_u32x4 = vcvtq_u32_f32(B_f32x4);
     C_u32x4 = vcvtq_u32_f32(C_f32x4);
     D_u32x4 = vcvtq_u32_f32(D_f32x4);
+#endif // #if __aarch64__
     // Convert uint32 vectors to uint16 vectors (with saturation)
     vd_low_u16x8  = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4));
     vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4));
     // convert uint16 vectors to uint8 vectors (with saturation)
     return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
 }
+
+template <RoundingPolicy round_policy>
 inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
 {
     // Convert uint8 vectors to int16 vectors
@@ -78,11 +109,36 @@ inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x
     B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
     C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
     D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
-    // Convert float32 vectors to int32 vectors
+#if __aarch64__
+    if(round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+    {
+        A_s32x4 = vcvtnq_s32_f32(A_f32x4);
+        B_s32x4 = vcvtnq_s32_f32(B_f32x4);
+        C_s32x4 = vcvtnq_s32_f32(C_f32x4);
+        D_s32x4 = vcvtnq_s32_f32(D_f32x4);
+    }
+    else if(round_policy == RoundingPolicy::TO_NEAREST_UP)
+    {
+        A_s32x4 = vcvtaq_s32_f32(A_f32x4);
+        B_s32x4 = vcvtaq_s32_f32(B_f32x4);
+        C_s32x4 = vcvtaq_s32_f32(C_f32x4);
+        D_s32x4 = vcvtaq_s32_f32(D_f32x4);
+    }
+    else
+    {
+        A_s32x4 = vcvtq_s32_f32(A_f32x4);
+        B_s32x4 = vcvtq_s32_f32(B_f32x4);
+        C_s32x4 = vcvtq_s32_f32(C_f32x4);
+        D_s32x4 = vcvtq_s32_f32(D_f32x4);
+    }
+#else  // #if __aarch64__
+    // rounding mode only supported in aarch64
     A_s32x4 = vcvtq_s32_f32(A_f32x4);
     B_s32x4 = vcvtq_s32_f32(B_f32x4);
     C_s32x4 = vcvtq_s32_f32(C_f32x4);
     D_s32x4 = vcvtq_s32_f32(D_f32x4);
+#endif // #if __aarch64__
+
     // Convert int32 vectors to int16 vectors (with saturation)
     vd_low_s16x8  = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
     vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index 05a0b505ca..f5555574cb 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -101,21 +101,21 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
                     // Perform activation
                     tmp = vmaxq_u8(vconst_0, vin);
                     // Re-quantize to new output space
-                    tmp = vmlaq_qasymm8(tmp, vs, vo);
+                    tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
                 }
                 else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
                 {
                     // Perform activation
                     tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
                     // Re-quantize to new output space
-                    tmp = vmlaq_qasymm8(tmp, vs, vo);
+                    tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
                 }
                 else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
                 {
                     // Perform activation
                     tmp = vminq_u8(va, vmaxq_u8(vb, vin));
                     // Re-quantize to new output space
-                    tmp = vmlaq_qasymm8(tmp, vs, vo);
+                    tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
                 }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
                 else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
@@ -230,17 +230,17 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
             if(act == ActivationLayerInfo::ActivationFunction::RELU)
             {
                 tmp = std::max(const_0, in);
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
             else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
             {
                 tmp = std::min(a, std::max(const_0, in));
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
             else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
             {
                 tmp = std::min(a, std::max(b, in));
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
             else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
index d7c982e414..d75d0071a2 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -92,21 +92,21 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
                     // Perform activation
                     tmp = vmaxq_s8(vconst_0, vin);
                     // Re-quantize to new output space
-                    tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+                    tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
                 }
                 else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
                 {
                     // Perform activation
                     tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
                     // Re-quantize to new output space
-                    tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+                    tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
                 }
                 else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
                 {
                     // Perform activation
                     tmp = vminq_s8(va, vmaxq_s8(vb, vin));
                     // Re-quantize to new output space
-                    tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+                    tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
                 }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
                 else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
@@ -214,17 +214,17 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
             if(act == ActivationLayerInfo::ActivationFunction::RELU)
             {
                 tmp = std::max(const_0, in);
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
             else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
             {
                 tmp = std::min(a, std::max(const_0, in));
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
            }
             else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
             {
                 tmp = std::min(a, std::max(b, in));
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
             else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-- 
cgit v1.2.1
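
Note (editor addition, not part of the patch): the templated vmlaq_qasymm8 / vmlaq_qasymm8_signed helpers above pick between vcvtnq (round to nearest, ties to even), vcvtaq (round to nearest, ties away from zero, selected here through RoundingPolicy::TO_NEAREST_UP) and vcvtq (truncate toward zero). The following is a minimal standalone sketch, assuming an aarch64 toolchain with arm_neon.h; the file, variable names and input values are illustrative only and are not taken from the library.

#include <arm_neon.h>
#include <cstdio>

int main()
{
    // Requantised Relu outputs often land exactly on .5 boundaries, where the
    // three conversion intrinsics disagree.
    const float       vals[4] = { -2.5f, -1.5f, 1.5f, 2.5f };
    const float32x4_t in      = vld1q_f32(vals);

    int32_t even[4], away[4], zero[4];
    vst1q_s32(even, vcvtnq_s32_f32(in)); // nearest, ties to even:        -2 -2  2  2
    vst1q_s32(away, vcvtaq_s32_f32(in)); // nearest, ties away from zero: -3 -2  2  3
    vst1q_s32(zero, vcvtq_s32_f32(in));  // toward zero (old behaviour):  -2 -1  1  2

    for(int i = 0; i < 4; ++i)
    {
        std::printf("%+.1f -> even=%d away=%d zero=%d\n", vals[i], even[i], away[i], zero[i]);
    }
    return 0;
}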