From 805145dcffff952b6fad390e4092ff6141106cab Mon Sep 17 00:00:00 2001
From: arngra01
Date: Mon, 4 Jan 2021 14:28:40 +0000
Subject: Add an SVE implementation of the Leaky ReLU activation function for
 qasymm8 and signed qasymm8 data.

Change-Id: I9249e7d4871d473cb5083d2225950faad6056eb4
Signed-off-by: Arnaud Grasset
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4763
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Reviewed-by: Sang-Hoon Park
Reviewed-by: Michalis Spyrou
Comments-Addressed: Arm Jenkins
---
 .../NEON/kernels/activation/impl/SVE/qasymm8.cpp   | 75 +++++++++++++++++++++-
 .../kernels/activation/impl/SVE/qasymm8_signed.cpp | 75 +++++++++++++++++++++-
 2 files changed, 148 insertions(+), 2 deletions(-)

(limited to 'src/core')

diff --git a/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp b/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp
index a49a562c84..55133f074c 100644
--- a/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp
+++ b/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,20 @@ void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLa
     auto vs = svdup_n_f32(s);
     auto vo = svdup_n_f32(o);
 
+    // Initialise scale/offset for re-quantization with int32_t
+    const auto voffset_in = svdup_n_s32(qi_in.offset);
+    int32_t    s_s32      = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    o_s32      = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_s32     = svdup_n_s32(s_s32);
+    const auto vo_s32     = svdup_n_s32(o_s32);
+
+    // Initialise scale/offset for re-quantization for leaky relu
+    int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+                                arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+    const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
     execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
         const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
@@ -164,6 +178,65 @@ void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLa
                 // Re-quantize to new output space
                 tmp = svquantize_z(pg, tmp_dep, qi_out);
             }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                svbool_t    p0, p1, p2, p3;
+                svint32x4_t tmp_dep;
+
+                // Expand to int32
+                const svint32x4_t vin_s32 =
+                {
+                    { {
+                        svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+                        svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+                        svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+                        svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))),
+                    } }
+                };
+
+                // Compare elements to input offset
+                if (qi_in.scale >= 0)
+                {
+                    p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+                else
+                {
+                    p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+
+                // Multiply negative elements and requantize if necessary
+                if (requant)
+                {
+                    tmp_dep = svcreate4_s32(
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)
+                        );
+                }
+                else
+                {
+                    tmp_dep = svcreate4_s32(
+                        svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)
+                        );
+                }
+
+                // Convert int32 vectors to uint16 vectors (with saturation)
+                const auto v_low_u16  = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+                const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                // Convert uint16 vectors to uint8 vectors (with saturation)
+                tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
+            }
             else
             {
                 ARM_COMPUTE_ERROR("Unsupported activation function");
diff --git a/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp b/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp
index f34bee88fc..5b010d9453 100644
--- a/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp
+++ b/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,6 +76,20 @@ void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const Activ
     auto vs = svdup_n_f32(s);
     auto vo = svdup_n_f32(o);
 
+    // Initialise scale/offset for re-quantization with int32_t
+    const auto voffset_in = svdup_n_s32(qi_in.offset);
+    int32_t    s_s32      = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    o_s32      = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_s32     = svdup_n_s32(s_s32);
+    const auto vo_s32     = svdup_n_s32(o_s32);
+
+    // Initialise scale/offset for re-quantization for leaky relu
+    int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+                                arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+    const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
     execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
         const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
@@ -163,6 +177,65 @@ void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const Activ
                 // Re-quantize to new output space
                 tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
             }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                svbool_t    p0, p1, p2, p3;
+                svint32x4_t tmp_dep;
+
+                // Expand to int32
+                const svint32x4_t vin_s32 =
+                {
+                    { {
+                        svmovlb_s32(svmovlb_s16(vin)),
+                        svmovlt_s32(svmovlb_s16(vin)),
+                        svmovlb_s32(svmovlt_s16(vin)),
+                        svmovlt_s32(svmovlt_s16(vin)),
+                    } }
+                };
+
+                // Compare elements to input offset
+                if (qi_in.scale >= 0)
+                {
+                    p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+                else
+                {
+                    p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+
+                // Multiply negative elements and requantize if necessary
+                if (requant)
+                {
+                    tmp_dep = svcreate4_s32(
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)
+                        );
+                }
+                else
+                {
+                    tmp_dep = svcreate4_s32(
+                        svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)
+                        );
+                }
+
+                // Convert int32 vectors to int16 vectors (with saturation)
+                const auto v_low_s16  = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+                const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                // Convert int16 vectors to int8 vectors (with saturation)
+                tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
+            }
             else
             {
                 ARM_COMPUTE_ERROR("Unsupported activation function");
--
cgit v1.2.1
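
For readers of this patch, a scalar sketch of the arithmetic that the new LEAKY_RELU branch vectorises may help. This is not code from the library: the helper name leaky_relu_qasymm8_ref is made up for illustration, std::lround stands in for arm_compute's TO_NEAREST_EVEN rounding policy, and the exact form of the combined scale/offset s and o is inferred from the o_leaky_s32 expression in the patch rather than shown in the diff context. The steps mirror the SVE code above: fold the input/output quantisation into Q24.8 integer multiplier/offset pairs, pick the "leaky" pair for elements below the input zero-point, multiply-accumulate in int32, shift right by 8, and saturate back to 8 bits.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar model of the requantised leaky ReLU implemented above.
uint8_t leaky_relu_qasymm8_ref(uint8_t x, float alpha,
                               float in_scale, int32_t in_offset,
                               float out_scale, int32_t out_offset)
{
    // Combined requantisation scale/offset (assumed): dst = s * src + o.
    const float s = in_scale / out_scale;
    const float o = -in_offset * s + out_offset;

    // Q24.8 fixed-point multiplier/offset pairs - the "(1 << 8)" factors in the patch.
    // Note: std::lround rounds half away from zero, not to nearest even as in the patch.
    const int32_t s_s32       = static_cast<int32_t>(std::lround(s * (1 << 8)));
    const int32_t o_s32       = static_cast<int32_t>(std::lround(o * (1 << 8)));
    const int32_t s_leaky_s32 = static_cast<int32_t>(std::lround(s * alpha * (1 << 8)));
    const int32_t o_leaky_s32 = static_cast<int32_t>(std::lround((-in_offset * s * alpha + out_offset) * (1 << 8)));

    // An element counts as negative when it lies below the input zero-point; the
    // comparison direction flips when the input scale is negative (svcmplt vs. svcmpgt).
    const bool negative = (in_scale >= 0) ? (x < in_offset) : (x > in_offset);

    // Select the leaky or identity pair, multiply-accumulate in int32 and shift the
    // Q24.8 result back down, mirroring svsel + svmla_s32_m + svasr_n_s32_m(..., 8).
    const int32_t mul = negative ? s_leaky_s32 : s_s32;
    const int32_t add = negative ? o_leaky_s32 : o_s32;
    const int32_t acc = (static_cast<int32_t>(x) * mul + add) >> 8; // arithmetic shift

    // Saturate to the unsigned 8-bit output range (svqxtun*/svqxtn* in the SVE code).
    return static_cast<uint8_t>(std::min(std::max(acc, 0), 255));
}

The signed variant follows the same scheme, with an int8_t input and saturation to [-128, 127]. Keeping the whole branch in Q24.8 integer arithmetic appears to be a deliberate choice: it avoids the dequantise-to-float32/requantise round trip that the neighbouring activation branches take (see the svquantize_z / svquantize_signed_z context lines in the hunks above).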