aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/activation/generic/sve2/qasymm8.cpp')
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qasymm8.cpp264
1 files changed, 144 insertions, 120 deletions
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
index bc9bc7aa3c..7efa9e4b72 100644
--- a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -26,18 +26,21 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include <cmath>
-#include <cstddef>
-
#include "src/core/NEON/SVEAsymm.h"
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -61,7 +64,7 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
// Initialise scale/offset for re-quantization
bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
{
requant = false;
}
@@ -78,139 +81,160 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
const auto vo_s32 = svdup_n_s32(o_s32);
// Initialise scale/offset for re-quantization for leaky relu
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- svuint8_t tmp;
+ svuint8_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_u8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_u8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
-
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
{
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
-
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 = svcreate4_s32(
- svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
+ const auto vin = svld1_u8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
{
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmax_u8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
}
-
- // Multiply negative elements and requantize if necessary
- if(requant)
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
+
+ // Expand to int32
+ const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
+
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+ // Convert uint32 vectors to uint16 vectors (with saturation)
+ const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+ // convert uint16 vectors to uint8 vectors (with saturation)
+ tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
}
else
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ ARM_COMPUTE_ERROR("Unsupported activation function");
}
- // Convert uint32 vectors to uint16 vectors (with saturation)
- const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
-
- // convert uint16 vectors to uint8 vectors (with saturation)
- tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_u8(pg, output_ptr + x, tmp);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
+ svst1_u8(pg, output_ptr + x, tmp);
- }
- while(svptest_any(svptrue_b8(), pg));
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
- },
- input, output);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute