From 20cfa45faefbf56f62c8b1aa95dfd0b4f52e5641 Mon Sep 17 00:00:00 2001
From: Pablo Marquez Tello
Date: Mon, 20 Mar 2023 16:29:21 +0000
Subject: Round to nearest with ties away from zero in Relu

* This patch adds support for rounding modes in vmlaq_qasymm8 and
  vmlaq_qasymm8_signed, which are used to compute Relu for quantized types
* Partially resolves MLCE-1018

Change-Id: I2a267b84745430e1ffe92b8bc79828a39332db18
Signed-off-by: Pablo Marquez Tello
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9354
Comments-Addressed: Arm Jenkins
Reviewed-by: Gunes Bayir
Reviewed-by: Viet-Hoa Do
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
---
 src/cpu/kernels/activation/generic/neon/qasymm8.cpp        | 14 +++++++-------
 src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index 05a0b505ca..f5555574cb 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -101,21 +101,21 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
                 // Perform activation
                 tmp = vmaxq_u8(vconst_0, vin);
                 // Re-quantize to new output space
-                tmp = vmlaq_qasymm8(tmp, vs, vo);
+                tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
             }
             else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
             {
                 // Perform activation
                 tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
                 // Re-quantize to new output space
-                tmp = vmlaq_qasymm8(tmp, vs, vo);
+                tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
             }
             else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
             {
                 // Perform activation
                 tmp = vminq_u8(va, vmaxq_u8(vb, vin));
                 // Re-quantize to new output space
-                tmp = vmlaq_qasymm8(tmp, vs, vo);
+                tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
             }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
             else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
@@ -230,17 +230,17 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
             if(act == ActivationLayerInfo::ActivationFunction::RELU)
             {
                 tmp = std::max(const_0, in);
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
             else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
             {
                 tmp = std::min(a, std::max(const_0, in));
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
             else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
             {
                 tmp = std::min(a, std::max(b, in));
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
             else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
index d7c982e414..d75d0071a2 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -92,21 +92,21 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
                 // Perform activation
                 tmp = vmaxq_s8(vconst_0, vin);
                 // Re-quantize to new output space
-                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+                tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
             }
             else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
             {
                 // Perform activation
                 tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
                 // Re-quantize to new output space
-                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+                tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
             }
             else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
             {
                 // Perform activation
                 tmp = vminq_s8(va, vmaxq_s8(vb, vin));
                 // Re-quantize to new output space
-                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+                tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
             }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
             else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
@@ -214,17 +214,17 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
             if(act == ActivationLayerInfo::ActivationFunction::RELU)
             {
                 tmp = std::max(const_0, in);
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
             else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
             {
                 tmp = std::min(a, std::max(const_0, in));
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
             else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
             {
                 tmp = std::min(a, std::max(b, in));
-                tmp = utility::clamp(tmp * s + o);
+                tmp = utility::clamp(support::cpp11::lround(tmp * s + o));
             }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
             else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-- 
cgit v1.2.1
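
Note: the behavioural change the patch makes is easiest to see in isolation. The sketch below is standalone illustrative C++, not ComputeLibrary code; it assumes that support::cpp11::lround forwards to std::lround on current toolchains, and that the old scalar path relied on the implicit float-to-integer conversion during clamping, which truncates toward zero.

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Requantized values near a halfway point, as produced by `tmp * s + o`.
        const float samples[] = { 2.5f, -2.5f, 2.49f, -2.49f };
        for(const float x : samples)
        {
            // Old behaviour: implicit float-to-int conversion truncates toward zero.
            const int truncated = static_cast<int>(x);
            // New behaviour: std::lround rounds to nearest, ties away from zero,
            // independent of the current floating-point rounding mode.
            const long rounded = std::lround(x);
            std::printf("%+5.2f -> truncated: %d, lround: %ld\n", x, truncated, rounded);
        }
        return 0;
    }

The two schemes disagree only at the ties: 2.5 truncates to 2 but rounds to 3, and -2.5 truncates to -2 but rounds to -3, which is exactly the "ties away from zero" behaviour named in the subject line.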
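
For context, here is a hedged scalar re-implementation of the patched ReLU leftover-element path from qasymm8.cpp. The function name relu_requantize is hypothetical, and the derivation of the folded scale `s` and offset `o` from the input/output quantization parameters is an assumption based on standard affine requantization; it is not shown in the hunks above.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical standalone version of the scalar QASYMM8 ReLU path.
    // `const_0` is the input zero point expressed in the quantized domain.
    uint8_t relu_requantize(uint8_t in, uint8_t const_0, float s, float o)
    {
        // ReLU in the quantized domain: clamp against the quantized zero.
        const uint8_t tmp = std::max(const_0, in);
        // Requantize to the output space with round-to-nearest, ties away
        // from zero (the patched behaviour), then clamp to the uint8 range
        // as utility::clamp does in the kernel.
        const long q = std::lround(tmp * s + o);
        return static_cast<uint8_t>(std::clamp<long>(q, 0, 255));
    }

Assuming, for illustration, an input quantized with scale 0.5 and offset 0 and an output with scale 1.0 and offset 0 (so s = 0.5 and o = 0), an input of 5 (real value 2.5) now requantizes to 3, where the old truncating path produced 2.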