Diffstat (limited to 'src/cpu/kernels/elementwise_unary/generic/neon')
7 files changed, 159 insertions, 132 deletions
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
index b2833c2481..2588db024d 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
@@ -23,17 +23,19 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp16_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<__fp16>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
index 6566821eca..936a2e588a 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
@@ -22,16 +22,18 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp32_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<float>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
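Both stubs above ignore the lut argument and forward to the shared elementwise_op<T> template shown in the impl.h hunks below, which walks each row with a vectorized main loop followed by a scalar tail for the leftover elements. A minimal standalone sketch of that main-loop/tail pattern, in plain C++ rather than the wrapper::vloadq/vinvsqrt/vstore intrinsics (the rsqrt_f32 name and the fixed step of 4 are illustrative, not ACL API):

#include <cmath>
#include <cstddef>

// Sketch: process whole "registers" of 4 floats first, then finish the
// remainder one element at a time, mirroring elementwise_op<T>'s
// window_step_x main loop and its scalar tail.
void rsqrt_f32(const float *in, float *out, std::size_t n)
{
    const std::size_t step = 4; // a 128-bit NEON register holds 4 floats
    std::size_t       x    = 0;
    for (; x + step <= n; x += step) // stands in for vinvsqrt(vloadq(...))
    {
        for (std::size_t i = 0; i < step; ++i)
        {
            out[x + i] = 1.0f / std::sqrt(in[x + i]);
        }
    }
    for (; x < n; ++x) // scalar tail: elementwise_op_scalar_imp in ACL
    {
        out[x] = 1.0f / std::sqrt(in[x]);
    }
}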
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
index dbc1dde4fa..d54d3984cb 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 
@@ -36,7 +37,7 @@ namespace cpu
 template <typename ScalarType>
 inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
 {
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::RSQRT:
             return 1 / sqrt(a);
@@ -60,7 +61,7 @@ inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarTyp
 template <typename ScalarType, typename VectorType>
 inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
 {
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::RSQRT:
             return wrapper::vinvsqrt(a);
@@ -94,22 +95,24 @@ inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-        const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
-        int x = window_start_x;
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
-        {
-            wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
-        }
-    },
-    input, output);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            auto output_ptr      = reinterpret_cast<ScalarType *>(output.ptr());
+            const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+            int x = window_start_x;
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
+            {
+                wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
+            }
+            for (; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
+            }
+        },
+        input, output);
 }
 
 template <>
@@ -128,75 +131,81 @@ inline void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int8x16_t vout;
-        auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-        const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
-        const auto vconst_0_f32 = vdupq_n_f32(0);
-        auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
-
-        int x = window_start_x;
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
-        {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-
-            // De-quantize
-            const auto vin_deq = vdequantize(vin, qi_in);
-
-            // Perform activation
-            float32x4x4_t vtmp_deq =
-            {
-                {
-                    elementwise_op_imp<float>(op, vin_deq.val[0]),
-                    elementwise_op_imp<float>(op, vin_deq.val[1]),
-                    elementwise_op_imp<float>(op, vin_deq.val[2]),
-                    elementwise_op_imp<float>(op, vin_deq.val[3]),
-                }
-            };
-            if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
-            {
-                vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
-                vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
-                vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
-                vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
-            }
-
-            // Re-quantize to new output space
-            vout = vquantize_signed(vtmp_deq, qi_out);
-            wrapper::vstore(output_ptr + x, vout);
-        }
-        for(; x < window_end_x; ++x)
-        {
-            qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
-            qasymm8_signed_t tmp = 0;
-            float tmp_f = dequantize_qasymm8_signed(in, qi_in);
-            if(tmp_f <= 0.0)
-            {
-                if(op == ElementWiseUnary::LOG)
-                {
-                    tmp_f = (-128 - qi_out.offset) * qi_out.scale;
-                }
-                else if(op == ElementWiseUnary::RSQRT)
-                {
-                    tmp_f = (127 - qi_out.offset) * qi_out.scale;
-                }
-                else
-                {
-                    tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
-                }
-            }
-            else
-            {
-                tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
-            }
-            tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
-            // For aarch64 LUT is used and rounding to nearest is used
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            int8x16_t vout;
+            auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+            const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+            const auto vconst_0_f32 = vdupq_n_f32(0);
+            auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
 
+            int x = window_start_x;
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
+            {
+                const auto vin = wrapper::vloadq(input_ptr + x);
+
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+
+                // Perform activation
+                float32x4x4_t vtmp_deq = {{
+                    elementwise_op_imp<float>(op, vin_deq.val[0]),
+                    elementwise_op_imp<float>(op, vin_deq.val[1]),
+                    elementwise_op_imp<float>(op, vin_deq.val[2]),
+                    elementwise_op_imp<float>(op, vin_deq.val[3]),
+                }};
+
+                if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+                {
+                    vtmp_deq.val[0] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+                    vtmp_deq.val[1] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+                    vtmp_deq.val[2] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+                    vtmp_deq.val[3] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+                }
+                // Re-quantize to new output space
+                vout = vquantize_signed(vtmp_deq, qi_out);
+                wrapper::vstore(output_ptr + x, vout);
+            }
+            for (; x < window_end_x; ++x)
+            {
+                qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+                qasymm8_signed_t tmp = 0;
+                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                if (tmp_f <= 0.0)
+                {
+                    if (op == ElementWiseUnary::LOG)
+                    {
+                        tmp_f = (-128 - qi_out.offset) * qi_out.scale;
+                    }
+                    else if (op == ElementWiseUnary::RSQRT)
+                    {
+                        tmp_f = (127 - qi_out.offset) * qi_out.scale;
+                    }
+                    else
+                    {
+                        tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+                    }
+                }
+                else
+                {
+                    tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+                }
+                tmp = quantize_qasymm8_signed(
+                    tmp_f, qi_out,
+                    RoundingPolicy::
+                        TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
+                                  // For aarch64 LUT is used and rounding to nearest is used
+                *(output_ptr + x) = tmp;
+            }
+        },
+        input, output);
 }
 
 template <>
 inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
@@ -215,71 +224,74 @@ inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Windo
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        uint8x16_t vout;
-        auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
-        auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-        const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
-        int x = window_start_x;
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
-        {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-
-            // De-quantize
-            const auto vin_deq = vdequantize(vin, qi_in);
-
-            // Perform activation
-            float32x4x4_t vtmp_deq =
-            {
-                {
-                    elementwise_op_imp<float>(op, vin_deq.val[0]),
-                    elementwise_op_imp<float>(op, vin_deq.val[1]),
-                    elementwise_op_imp<float>(op, vin_deq.val[2]),
-                    elementwise_op_imp<float>(op, vin_deq.val[3]),
-                }
-            };
-            if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
-            {
-                vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
-                vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
-                vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
-                vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
-            }
-            // Re-quantize to new output space
-            vout = vquantize(vtmp_deq, qi_out);
-            wrapper::vstore(output_ptr + x, vout);
-        }
-        for(; x < window_end_x; ++x)
-        {
-            qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
-            qasymm8_t tmp = 0;
-            float tmp_f = dequantize_qasymm8(in, qi_in);
-            if(tmp_f <= 0.0)
-            {
-                if(op == ElementWiseUnary::LOG)
-                {
-                    tmp_f = (0 - qi_out.offset) * qi_out.scale;
-                }
-                else if(op == ElementWiseUnary::RSQRT)
-                {
-                    tmp_f = (255 - qi_out.offset) * qi_out.scale;
-                }
-                else
-                {
-                    tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
-                }
-            }
-            else
-            {
-                tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
-            }
-            tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            uint8x16_t vout;
+            auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+            auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+            const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+            int x = window_start_x;
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
+            {
+                const auto vin = wrapper::vloadq(input_ptr + x);
+
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+
+                // Perform activation
+                float32x4x4_t vtmp_deq = {{
+                    elementwise_op_imp<float>(op, vin_deq.val[0]),
+                    elementwise_op_imp<float>(op, vin_deq.val[1]),
+                    elementwise_op_imp<float>(op, vin_deq.val[2]),
+                    elementwise_op_imp<float>(op, vin_deq.val[3]),
+                }};
+                if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+                {
+                    vtmp_deq.val[0] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+                    vtmp_deq.val[1] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+                    vtmp_deq.val[2] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+                    vtmp_deq.val[3] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+                }
+                // Re-quantize to new output space
+                vout = vquantize(vtmp_deq, qi_out);
+                wrapper::vstore(output_ptr + x, vout);
+            }
+            for (; x < window_end_x; ++x)
+            {
+                qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+                qasymm8_t tmp = 0;
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                if (tmp_f <= 0.0)
+                {
+                    if (op == ElementWiseUnary::LOG)
+                    {
+                        tmp_f = (0 - qi_out.offset) * qi_out.scale;
+                    }
+                    else if (op == ElementWiseUnary::RSQRT)
+                    {
+                        tmp_f = (255 - qi_out.offset) * qi_out.scale;
+                    }
+                    else
+                    {
+                        tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+                    }
+                }
+                else
+                {
+                    tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+                }
+                tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
+                *(output_ptr + x) = tmp;
+            }
+        },
+        input, output);
 }
 } // namespace cpu
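The two quantized specializations above share one scalar recipe: dequantize the byte, special-case non-positive inputs for LOG and RSQRT (whose true results would be -inf or NaN) by substituting the most extreme representable output, apply the op in float otherwise, then requantize with round-toward-zero so the tail agrees with vquantize()/vquantize_signed() on armv7a. A minimal standalone sketch of the uint8 LOG case, with QuantInfo and log_qasymm8 as hypothetical stand-ins for ACL's quantization-info type and the in-kernel code:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantInfo // hypothetical stand-in for the library's uniform quantization info
{
    float   scale;
    int32_t offset;
};

uint8_t log_qasymm8(uint8_t in, QuantInfo qi_in, QuantInfo qi_out)
{
    // Dequantize: map the stored byte back to a real value.
    const float x = (static_cast<int32_t>(in) - qi_in.offset) * qi_in.scale;
    // log(x) is undefined for x <= 0, so clamp to the smallest value the
    // output space can represent: the dequantization of level 0.
    const float y = (x <= 0.0f) ? (0 - qi_out.offset) * qi_out.scale : std::log(x);
    // Requantize with truncation (RoundingPolicy::TO_ZERO), saturating to [0, 255].
    const int32_t q = static_cast<int32_t>(y / qi_out.scale) + qi_out.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

With qi_out.scale = 0.1f and qi_out.offset = 128, for instance, a non-positive input maps to (0 - 128) * 0.1 = -12.8, which requantizes back to level 0, the most negative value the output space can hold; the int8 variant clamps to level -128 (LOG) or 127 (RSQRT) in the same way.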
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
index dfe5e30035..d4daad4ca6 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
@@ -22,16 +22,18 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_s32_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<int32_t>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
index 08bb7f28b6..38cb61d0ff 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/lut/list.h"
 
 namespace arm_compute
@@ -32,24 +33,28 @@ namespace cpu
 
 #ifdef __aarch64__
 
-void neon_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_q8_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(op);
 
-    auto win = window;
+    auto       win          = window;
     const auto window_end_x = window.x().end();
 
     win.set(0, Window::Dimension(0, 1, 1));
 
     Iterator src_it(in, win);
     Iterator dst_it(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &) {
-        const auto src_ptr = src_it.ptr();
-        auto dst_ptr = dst_it.ptr();
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto src_ptr = src_it.ptr();
+            auto dst_ptr       = dst_it.ptr();
 
-        lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
-    },
-    src_it, dst_it);
+            lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+        },
+        src_it, dst_it);
 }
 
 #endif // __aarch64__
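On aarch64 the quantized kernels skip the per-element arithmetic entirely: a qasymm8 input can only take 256 values, so the caller precomputes a table mapping every possible input byte to its output byte, and lut_u8_neon above applies it across the row (the NEON version presumably does the lookups with vector table instructions; as the impl.h comment notes, the table is built with round-to-nearest). The scalar effect, with lut_u8_scalar as a hypothetical illustration rather than an ACL function:

#include <cstddef>
#include <cstdint>

// One table lookup per byte replaces the whole dequantize -> op -> requantize chain.
void lut_u8_scalar(const uint8_t (&lut)[256], const uint8_t *src, uint8_t *dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        dst[i] = lut[src[i]];
    }
}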
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
index d987f7747b..3e4b88eb47 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Window.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
@@ -31,7 +32,8 @@ namespace cpu
 {
 #ifndef __aarch64__
 // Fallback function to be used for armv7a, for aarch64 LUT is used
-void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<uint8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
index e00970a1e0..a5f4b053e3 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Window.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
@@ -31,7 +32,8 @@ namespace cpu
 {
 #ifndef __aarch64__
 // Fallback function to be used for armv7a, for aarch64 LUT is used
-void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_signed_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<int8_t>(in, out, window, op);