about | summary | refs | log | tree | commit | diff
path: root/src/cpu/kernels/l2normlayer
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/l2normlayer')
-rw-r--r-- src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp | 6
-rw-r--r-- src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp | 10
-rw-r--r-- src/cpu/kernels/l2normlayer/generic/neon/impl.h | 96
-rw-r--r-- src/cpu/kernels/l2normlayer/list.h | 5
4 files changed, 64 insertions, 53 deletions
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
index 661c3d7f46..6c6527de06 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
@@ -32,13 +32,15 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+void neon_fp16_l2_normalize_x(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
{
ARM_COMPUTE_UNUSED(unused_axis);
return l2_normalize_x<float16_t, 8>(in, sum, out, epsilon, window);
}
-void neon_fp16_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void neon_fp16_l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
return l2_normalize_yz<float16_t, 8>(in, sum, out, epsilon, window, axis);
}
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
index be32bdc4fa..520877068c 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
@@ -22,21 +22,23 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
+
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+void neon_fp32_l2_normalize_x(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
{
ARM_COMPUTE_UNUSED(unused_axis);
return l2_normalize_x<float, 4>(in, sum, out, epsilon, window);
}
-void neon_fp32_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void neon_fp32_l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
return l2_normalize_yz<float, 4>(in, sum, out, epsilon, window, axis);
}
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
index a06cdd33d3..6bd19299b7 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h
+++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <cstddef>
@@ -51,33 +52,36 @@ void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float e
Iterator sum_it(sum, win_collapsed);
Iterator output_it(out, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
- const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
- const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
- const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
-
- // Compute elements over vector steps
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- out_ptr[x] = in_ptr[x] * norm_value;
- }
- },
- input_it, sum_it, output_it);
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
+ const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
+ const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
+
+ // Compute elements over vector steps
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ out_ptr[x] = in_ptr[x] * norm_value;
+ }
+ },
+ input_it, sum_it, output_it);
}
template <typename T, int S>
-void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
@@ -97,28 +101,30 @@ void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float
const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
- const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
- // Compute elements over vector steps
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
- wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
- out_ptr[x] = in_ptr[x] * norm_value;
- }
- },
- input_it, sum_it, output_it);
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ // Compute elements over vector steps
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
+ wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
+ out_ptr[x] = in_ptr[x] * norm_value;
+ }
+ },
+ input_it, sum_it, output_it);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h
index 2bad7f54f5..e2a879d06e 100644
--- a/src/cpu/kernels/l2normlayer/list.h
+++ b/src/cpu/kernels/l2normlayer/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_L2NORMLAYER_KERNEL(func_name) \
- void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+#define DECLARE_L2NORMLAYER_KERNEL(func_name) \
+ void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \
+ size_t axis)
DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x);
DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz);