about summary refs log tree commit diff
path: root/src/cpu/kernels/elementwise_unary/generic
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/elementwise_unary/generic')
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp | 6
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp | 6
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/neon/impl.h | 244
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp | 6
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp | 21
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp | 4
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp | 4
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp | 6
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp | 6
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp | 44
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp | 6
-rw-r--r-- src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp | 20
12 files changed, 207 insertions, 166 deletions
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
index b2833c2481..2588db024d 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
@@ -23,17 +23,19 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp16_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<__fp16>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
index 6566821eca..936a2e588a 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
@@ -22,16 +22,18 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<float>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
index dbc1dde4fa..d54d3984cb 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
@@ -36,7 +37,7 @@ namespace cpu
template <typename ScalarType>
inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return 1 / sqrt(a);
@@ -60,7 +61,7 @@ inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarTyp
template <typename ScalarType, typename VectorType>
inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return wrapper::vinvsqrt(a);
@@ -94,22 +95,24 @@ inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
- }
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
- }
- },
- input, output);
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
+ }
+ },
+ input, output);
}
template <>
@@ -128,75 +131,81 @@ inline void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int8x16_t vout;
- auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto vconst_0_f32 = vdupq_n_f32(0);
- auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
-
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(input_ptr + x);
-
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
+ int8x16_t vout;
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
- // Perform activation
- float32x4x4_t vtmp_deq =
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
{
- {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ // Perform activation
+ float32x4x4_t vtmp_deq = {{
elementwise_op_imp<float>(op, vin_deq.val[0]),
elementwise_op_imp<float>(op, vin_deq.val[1]),
elementwise_op_imp<float>(op, vin_deq.val[2]),
elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }};
+
+ if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] =
+ vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] =
+ vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] =
+ vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] =
+ vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
}
- };
- if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
- {
- vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
- vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
- vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
- vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+ // Re-quantize to new output space
+ vout = vquantize_signed(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
}
-
- // Re-quantize to new output space
- vout = vquantize_signed(vtmp_deq, qi_out);
- wrapper::vstore(output_ptr + x, vout);
- }
- for(; x < window_end_x; ++x)
- {
- qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
- qasymm8_signed_t tmp = 0;
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- if(tmp_f <= 0.0)
+ for (; x < window_end_x; ++x)
{
- if(op == ElementWiseUnary::LOG)
- {
- tmp_f = (-128 - qi_out.offset) * qi_out.scale;
- }
- else if(op == ElementWiseUnary::RSQRT)
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ if (tmp_f <= 0.0)
{
- tmp_f = (127 - qi_out.offset) * qi_out.scale;
+ if (op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (-128 - qi_out.offset) * qi_out.scale;
+ }
+ else if (op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (127 - qi_out.offset) * qi_out.scale;
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
}
else
{
tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
}
+ tmp = quantize_qasymm8_signed(
+ tmp_f, qi_out,
+ RoundingPolicy::
+ TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
+ // For aarch64 LUT is used and rounding to nearest is used
+ *(output_ptr + x) = tmp;
}
- else
- {
- tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
- }
- tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
- // For aarch64 LUT is used and rounding to nearest is used
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
template <>
inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
@@ -215,71 +224,74 @@ inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Windo
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- uint8x16_t vout;
- auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
- auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(input_ptr + x);
+ uint8x16_t vout;
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- float32x4x4_t vtmp_deq =
- {
- {
+ // Perform activation
+ float32x4x4_t vtmp_deq = {{
elementwise_op_imp<float>(op, vin_deq.val[0]),
elementwise_op_imp<float>(op, vin_deq.val[1]),
elementwise_op_imp<float>(op, vin_deq.val[2]),
elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }};
+ if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] =
+ vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] =
+ vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] =
+ vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] =
+ vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
}
- };
- if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
- {
- vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
- vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
- vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
- vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
- }
- // Re-quantize to new output space
- vout = vquantize(vtmp_deq, qi_out);
- wrapper::vstore(output_ptr + x, vout);
- }
- for(; x < window_end_x; ++x)
- {
- qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
- qasymm8_t tmp = 0;
- float tmp_f = dequantize_qasymm8(in, qi_in);
- if(tmp_f <= 0.0)
+ // Re-quantize to new output space
+ vout = vquantize(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
+ }
+ for (; x < window_end_x; ++x)
{
- if(op == ElementWiseUnary::LOG)
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ if (tmp_f <= 0.0)
{
- tmp_f = (0 - qi_out.offset) * qi_out.scale;
- }
- else if(op == ElementWiseUnary::RSQRT)
- {
- tmp_f = (255 - qi_out.offset) * qi_out.scale;
+ if (op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (0 - qi_out.offset) * qi_out.scale;
+ }
+ else if (op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (255 - qi_out.offset) * qi_out.scale;
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
}
else
{
tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
}
+ tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
+ *(output_ptr + x) = tmp;
}
- else
- {
- tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
- }
- tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
index dfe5e30035..d4daad4ca6 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
@@ -22,16 +22,18 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_s32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<int32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
index 08bb7f28b6..38cb61d0ff 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
@@ -32,24 +33,28 @@ namespace cpu
#ifdef __aarch64__
-void neon_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_q8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(op);
- auto win = window;
+ auto win = window;
const auto window_end_x = window.x().end();
win.set(0, Window::Dimension(0, 1, 1));
Iterator src_it(in, win);
Iterator dst_it(out, win);
- execute_window_loop(win, [&](const Coordinates &) {
- const auto src_ptr = src_it.ptr();
- auto dst_ptr = dst_it.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = src_it.ptr();
+ auto dst_ptr = dst_it.ptr();
- lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
- },
- src_it, dst_it);
+ lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+ },
+ src_it, dst_it);
}
#endif // __aarch64__
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
index d987f7747b..3e4b88eb47 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Window.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
@@ -31,7 +32,8 @@ namespace cpu
{
#ifndef __aarch64__
// Fallback function to be used for armv7a, for aarch64 LUT is used
-void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<uint8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
index e00970a1e0..a5f4b053e3 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Window.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
@@ -31,7 +32,8 @@ namespace cpu
{
#ifndef __aarch64__
// Fallback function to be used for armv7a, for aarch64 LUT is used
-void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_signed_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<int8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
index a883309b2e..22ff43c5d9 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
@@ -30,11 +31,12 @@ namespace arm_compute
{
namespace cpu
{
-void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_fp16_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<float16_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
index b21ed8ddbc..394bd47adf 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
@@ -30,10 +31,11 @@ namespace arm_compute
{
namespace cpu
{
-void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_fp32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<float32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
index a948862906..5af534d9e7 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
@@ -31,9 +32,10 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return svinvsqrt(pg, a);
@@ -55,9 +57,10 @@ inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::val
}
template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::NEG:
return svneg_z(pg, a);
@@ -81,23 +84,24 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = svld1(pg, input_ptr + x);
- svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input, output);
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto vin = svld1(pg, input_ptr + x);
+ svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input, output);
}
template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
index 068c3f7cda..e27fe5a87f 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
@@ -23,16 +23,18 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_s32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<int32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
index 7e32f50132..4e4582debb 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
@@ -23,13 +23,15 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve2_q8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(op);
@@ -40,14 +42,16 @@ void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &wi
Iterator src_it(in, win);
Iterator dst_it(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = src_it.ptr();
- auto dst_ptr = dst_it.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = src_it.ptr();
+ auto dst_ptr = dst_it.ptr();
- lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
- },
- src_it, dst_it);
+ lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+ },
+ src_it, dst_it);
}
} // namespace cpu