aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp')
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp306
1 files changed, 174 insertions, 132 deletions
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
index d20684f54d..e4667522dd 100644
--- a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -24,20 +24,23 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <cmath>
-#include <cstddef>
#include "src/core/NEON/SVEAsymm.h"
#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -65,7 +68,7 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
// Initialise scale/offset for re-quantization
bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
{
requant = false;
}
@@ -82,151 +85,190 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
const auto vo_s32 = svdup_n_s32(o_s32);
// Initialise scale/offset for re-quantization for leaky relu
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- svint8_t tmp;
+ svint8_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_s8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_s8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
{
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 = svcreate4_s32(
- svmovlb_s32(svmovlb_s16(vin)),
- svmovlt_s32(svmovlb_s16(vin)),
- svmovlb_s32(svmovlt_s16(vin)),
- svmovlt_s32(svmovlt_s16(vin)));
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
+ const auto vin = svld1_s8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
{
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmax_s8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
-
- // Multiply negative elements and requantize if necessary
- if(requant)
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, svget4_f32(vin_deq, 0),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 0),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 1),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 1),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 2),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 2),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 3),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 3),
+ const_3_f32))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
}
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
- // Convert uint32 vectors to uint16 vectors (with saturation)
- const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+ // Expand to int32
+ const svint32x4_t vin_s32 =
+ svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)),
+ svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin)));
- // convert uint16 vectors to uint8 vectors (with saturation)
- tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+ // Convert uint32 vectors to uint16 vectors (with saturation)
+ const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+ // convert uint16 vectors to uint8 vectors (with saturation)
+ tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
- svst1_s8(pg, output_ptr + x, tmp);
+ svst1_s8(pg, output_ptr + x, tmp);
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute