From 805145dcffff952b6fad390e4092ff6141106cab Mon Sep 17 00:00:00 2001
From: arngra01
Date: Mon, 4 Jan 2021 14:28:40 +0000
Subject: Add an SVE implementation of the Leaky ReLU activation function for
 qasymm8 and signed qasymm8 data.

Change-Id: I9249e7d4871d473cb5083d2225950faad6056eb4
Signed-off-by: Arnaud Grasset
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4763
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Reviewed-by: Sang-Hoon Park
Reviewed-by: Michalis Spyrou
Comments-Addressed: Arm Jenkins
---
 .../NEON/kernels/activation/impl/SVE/qasymm8.cpp   | 75 +++++++++++++++++++++-
 .../kernels/activation/impl/SVE/qasymm8_signed.cpp | 75 +++++++++++++++++++++-
 2 files changed, 148 insertions(+), 2 deletions(-)

(limited to 'src/core')

diff --git a/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp b/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp
index a49a562c84..55133f074c 100644
--- a/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp
+++ b/src/core/NEON/kernels/activation/impl/SVE/qasymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,20 @@ void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLa
     auto vs = svdup_n_f32(s);
     auto vo = svdup_n_f32(o);
 
+    // Initialise scale/offset for re-quantization with int32_t
+    const auto voffset_in = svdup_n_s32(qi_in.offset);
+    int32_t    s_s32      = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    o_s32      = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_s32     = svdup_n_s32(s_s32);
+    const auto vo_s32     = svdup_n_s32(o_s32);
+
+    // Initialise scale/offset for re-quantization for leaky relu
+    int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+                                arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+    const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
     execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
         const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
@@ -164,6 +178,65 @@ void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLa
                 // Re-quantize to new output space
                 tmp = svquantize_z(pg, tmp_dep, qi_out);
             }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                svbool_t    p0, p1, p2, p3;
+                svint32x4_t tmp_dep;
+
+                // Expand to int32
+                const svint32x4_t vin_s32 =
+                {
+                    { {
+                        svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+                        svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+                        svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+                        svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))),
+                    } }
+                };
+
+                // Compare elements to input offset
+                if (qi_in.scale >= 0)
+                {
+                    p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+                else
+                {
+                    p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+
+                // Multiply negative elements and requantize if necessary
+                if (requant)
+                {
+                    tmp_dep = svcreate4_s32(
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)
+                        );
+                }
+                else
+                {
+                    tmp_dep = svcreate4_s32(
+                        svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)
+                        );
+                }
+
+                // Convert int32 vectors to uint16 vectors (with saturation)
+                const auto v_low_u16  = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+                const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                // Convert uint16 vectors to uint8 vectors (with saturation)
+                tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
+            }
             else
             {
                 ARM_COMPUTE_ERROR("Unsupported activation function");
diff --git a/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp b/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp
index f34bee88fc..5b010d9453 100644
--- a/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp
+++ b/src/core/NEON/kernels/activation/impl/SVE/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,6 +76,20 @@ void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const Activ
     auto vs = svdup_n_f32(s);
     auto vo = svdup_n_f32(o);
 
+    // Initialise scale/offset for re-quantization with int32_t
+    const auto voffset_in = svdup_n_s32(qi_in.offset);
+    int32_t    s_s32      = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    o_s32      = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_s32     = svdup_n_s32(s_s32);
+    const auto vo_s32     = svdup_n_s32(o_s32);
+
+    // Initialise scale/offset for re-quantization for leaky relu
+    int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+                                arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
+    const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
+
     execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
         const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
@@ -163,6 +177,65 @@ void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const Activ
                 // Re-quantize to new output space
                 tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
             }
+            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+            {
+                svbool_t    p0, p1, p2, p3;
+                svint32x4_t tmp_dep;
+
+                // Expand to int32
+                const svint32x4_t vin_s32 =
+                {
+                    { {
+                        svmovlb_s32(svmovlb_s16(vin)),
+                        svmovlt_s32(svmovlb_s16(vin)),
+                        svmovlb_s32(svmovlt_s16(vin)),
+                        svmovlt_s32(svmovlt_s16(vin)),
+                    } }
+                };
+
+                // Compare elements to input offset
+                if (qi_in.scale >= 0)
+                {
+                    p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+                else
+                {
+                    p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                    p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                    p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                    p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                }
+
+                // Multiply negative elements and requantize if necessary
+                if (requant)
+                {
+                    tmp_dep = svcreate4_s32(
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
+                        svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)
+                        );
+                }
+                else
+                {
+                    tmp_dep = svcreate4_s32(
+                        svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+                        svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)
+                        );
+                }
+
+                // Convert int32 vectors to int16 vectors (with saturation)
+                const auto v_low_s16  = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+                const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                // Convert int16 vectors to int8 vectors (with saturation)
+                tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
+            }
             else
             {
                 ARM_COMPUTE_ERROR("Unsupported activation function");
--
cgit v1.2.1
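
For readers of this patch, a scalar sketch of the arithmetic that the new LEAKY_RELU branch vectorises may help. This is not code from the library: the helper name leaky_relu_qasymm8_ref is made up for illustration, std::lround stands in for arm_compute's TO_NEAREST_EVEN rounding policy, and the exact form of the combined scale/offset s and o is inferred from the o_leaky_s32 expression in the patch rather than shown in the diff context. The steps mirror the SVE code above: fold the input/output quantisation into Q24.8 integer multiplier/offset pairs, pick the "leaky" pair for elements below the input zero-point, multiply-accumulate in int32, shift right by 8, and saturate back to 8 bits.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar model of the requantised leaky ReLU implemented above.
uint8_t leaky_relu_qasymm8_ref(uint8_t x, float alpha,
                               float in_scale, int32_t in_offset,
                               float out_scale, int32_t out_offset)
{
    // Combined requantisation scale/offset (assumed): dst = s * src + o.
    const float s = in_scale / out_scale;
    const float o = -in_offset * s + out_offset;

    // Q24.8 fixed-point multiplier/offset pairs - the "(1 << 8)" factors in the patch.
    // Note: std::lround rounds half away from zero, not to nearest even as in the patch.
    const int32_t s_s32       = static_cast<int32_t>(std::lround(s * (1 << 8)));
    const int32_t o_s32       = static_cast<int32_t>(std::lround(o * (1 << 8)));
    const int32_t s_leaky_s32 = static_cast<int32_t>(std::lround(s * alpha * (1 << 8)));
    const int32_t o_leaky_s32 = static_cast<int32_t>(std::lround((-in_offset * s * alpha + out_offset) * (1 << 8)));

    // An element counts as negative when it lies below the input zero-point; the
    // comparison direction flips when the input scale is negative (svcmplt vs. svcmpgt).
    const bool negative = (in_scale >= 0) ? (x < in_offset) : (x > in_offset);

    // Select the leaky or identity pair, multiply-accumulate in int32 and shift the
    // Q24.8 result back down, mirroring svsel + svmla_s32_m + svasr_n_s32_m(..., 8).
    const int32_t mul = negative ? s_leaky_s32 : s_s32;
    const int32_t add = negative ? o_leaky_s32 : o_s32;
    const int32_t acc = (static_cast<int32_t>(x) * mul + add) >> 8; // arithmetic shift

    // Saturate to the unsigned 8-bit output range (svqxtun*/svqxtn* in the SVE code).
    return static_cast<uint8_t>(std::min(std::max(acc, 0), 255));
}

The signed variant follows the same scheme, with an int8_t input and saturation to [-128, 127]. Keeping the whole branch in Q24.8 integer arithmetic appears to be a deliberate choice: it avoids the dequantise-to-float32/requantise round trip that the neighbouring activation branches take (see the svquantize_z / svquantize_signed_z context lines in the hunks above).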