about | summary | refs | log | tree | commit | diff
path: root/src/cpu/kernels/softmax/generic
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/softmax/generic')
-rw-r--r-- src/cpu/kernels/softmax/generic/neon/fp16.cpp 12
-rw-r--r-- src/cpu/kernels/softmax/generic/neon/fp32.cpp 12
-rw-r--r-- src/cpu/kernels/softmax/generic/neon/impl.cpp 281
-rw-r--r-- src/cpu/kernels/softmax/generic/neon/impl.h 248
-rw-r--r-- src/cpu/kernels/softmax/generic/neon/qasymm8.cpp 12
-rw-r--r-- src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp 12
-rw-r--r-- src/cpu/kernels/softmax/generic/sve/fp16.cpp 12
-rw-r--r-- src/cpu/kernels/softmax/generic/sve/fp32.cpp 12
-rw-r--r-- src/cpu/kernels/softmax/generic/sve/impl.cpp 211
-rw-r--r-- src/cpu/kernels/softmax/generic/sve/impl.h 9
-rw-r--r-- src/cpu/kernels/softmax/generic/sve/qasymm8.cpp 3
-rw-r--r-- src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp 3
-rw-r--r-- src/cpu/kernels/softmax/generic/sve2/impl.cpp 289
-rw-r--r-- src/cpu/kernels/softmax/generic/sve2/impl.h 9
-rw-r--r-- src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp 12
-rw-r--r-- src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp 12
16 files changed, 634 insertions, 515 deletions
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
index f6556696b0..2e2adf33e0 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
@@ -30,8 +31,13 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_fp16_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -40,6 +46,6 @@ void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<float16_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
index ddd270ae70..61df40c1b5 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_fp32_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<float>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index f07fd2fb27..5d6e6a4f80 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
#include "support/SaturateCast.h"
namespace arm_compute
@@ -32,11 +33,10 @@ template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *o
template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
+void neon_softmax_logits_1d_quantized(
+ const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
{
- static_assert(std::is_same<T, qasymm8_t>::value
- || std::is_same<T, qasymm8_signed_t>::value,
+ static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
"quantized type should be either qasymm8_t or qasymm8_signed_t.");
const int start_x = in->info()->valid_region().anchor.x();
@@ -50,163 +50,174 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi
Iterator out_it(out, window);
constexpr int vec_size = 16;
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
- float sum{};
- float sum_inversed{};
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<float *>(tmp);
- /* Init sum to zero */
- float32x4x4_t vec_sum =
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- };
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vqsub(vec_max, vec_elements);
- auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+ float sum{};
+ float sum_inversed{};
- if(is_log)
- {
- vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
- vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
- vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
- vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
- }
- else
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
+
+ /* Init sum to zero */
+ float32x4x4_t vec_sum = {
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ };
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
- vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
- vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
- vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vqsub(vec_max, vec_elements);
+ auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+
+ if (is_log)
+ {
+ vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
+ vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
+ vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
+ vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
+ }
+ else
+ {
+ vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
+ vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
+ vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
+ vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ }
+
+ vst4q_f32(tmp_ptr + x, vec_elements_flt);
}
- vst4q_f32(tmp_ptr + x, vec_elements_flt);
- }
+ /* Reduce sum */
+ const auto sum_16_byte =
+ vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
+ sum_res = vpadd_f32(sum_res, sum_res);
+ sum = wrapper::vgetlane(sum_res, 0);
- /* Reduce sum */
- const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
- auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
- sum_res = vpadd_f32(sum_res, sum_res);
- sum = wrapper::vgetlane(sum_res, 0);
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ float element{};
+ if (is_log)
+ {
+ element = (max_val - in_ptr[x]) * scale_beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((max_val - in_ptr[x]) * scale_beta);
+ sum += element;
+ }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- float element{};
- if(is_log)
+ tmp_ptr[x] = element;
+ }
+
+ if (!is_log)
{
- element = (max_val - in_ptr[x]) * scale_beta;
- sum += std::exp(element);
+ sum_inversed = 256.f / sum;
}
else
{
- element = std::exp((max_val - in_ptr[x]) * scale_beta);
- sum += element;
+ sum = std::log(sum);
}
-
- tmp_ptr[x] = element;
}
- if(!is_log)
- {
- sum_inversed = 256.f / sum;
- }
- else
+ /* Normalize exponentials */
{
- sum = std::log(sum);
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
- float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
- int_vec_type normalized_value{};
- if(is_log)
+ constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- const float32x4x4_t sub =
+ using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
+ float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
+ int_vec_type normalized_value{};
+ if (is_log)
{
- vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
- };
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+ const float32x4x4_t sub = {
+ vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
+ };
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+ }
+ else
+ {
+ float32x4x4_t mul = {
+ vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
+ };
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
+ mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
+ mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
+ mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
+ mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
+ }
+
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
+ }
+ wrapper::vstore(out_ptr + x, normalized_value);
}
- else
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
{
- float32x4x4_t mul =
+ if (is_log)
{
- vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
- };
-
- if(is_qasymm8_signed)
+ out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+ }
+ else
{
- const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
- mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
- mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
- mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
- mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
+ out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -
+ (is_qasymm8_signed ? 128.f : 0));
}
-
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
- }
- else
- {
- out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
}
}
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
-template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
-template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
+template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 206d36a2e0..4d9b789297 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -42,53 +43,65 @@ void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(in, win);
Iterator output(out, win);
const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output.ptr());
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
- for(int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+ T max_val = wrapper::vgetlane(carry_max, 0);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
+ }
- *out_ptr = max_val;
- },
- input, output);
+ *out_ptr = max_val;
+ },
+ input, output);
}
template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+void neon_softmax_logits_1d_quantized(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -103,113 +116,118 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c
constexpr int vec_size = 16 / sizeof(T);
const int sum_stages = log2(vec_size / 2);
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
- T sum{};
- T sum_inversed{};
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
-
- /* Init sum to zero */
- auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<T *>(tmp);
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vsub(vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
- }
- else
- {
- vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
- }
- wrapper::vstore(tmp_ptr + x, vec_elements);
- }
+ T sum{};
+ T sum_inversed{};
- /* Reduce sum */
- auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
- for(int i = 0; i < sum_stages; ++i)
+ /* Compute exponentials and sum */
{
- sum_res = wrapper::vpadd(sum_res, sum_res);
- }
- sum = wrapper::vgetlane(sum_res, 0);
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- T element{};
+ /* Init sum to zero */
+ auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- element = (in_ptr[x] - max_val) * beta;
- sum += std::exp(element);
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vsub(vec_elements, vec_max);
+ if (is_log)
+ {
+ vec_elements =
+ wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
+ vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+ }
+ else
+ {
+ vec_elements = wrapper::vexpq(
+ wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
+ vec_sum = wrapper::vadd(vec_sum, vec_elements);
+ }
+ wrapper::vstore(tmp_ptr + x, vec_elements);
}
- else
+
+ /* Reduce sum */
+ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
+ for (int i = 0; i < sum_stages; ++i)
{
- element = std::exp((in_ptr[x] - max_val) * beta);
- sum += element;
+ sum_res = wrapper::vpadd(sum_res, sum_res);
}
- tmp_ptr[x] = element;
- }
+ sum = wrapper::vgetlane(sum_res, 0);
- if(!is_log)
- {
- sum_inversed = T(1) / sum;
- }
- else
- {
- sum = static_cast<T>(std::log(sum));
- }
- }
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ T element{};
+
+ if (is_log)
+ {
+ element = (in_ptr[x] - max_val) * beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((in_ptr[x] - max_val) * beta);
+ sum += element;
+ }
+ tmp_ptr[x] = element;
+ }
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
+ if (!is_log)
{
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ sum_inversed = T(1) / sum;
}
else
{
- normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ sum = static_cast<T>(std::log(sum));
}
- wrapper::vstore(out_ptr + x, normalized_value);
}
- /* Run remaining elements */
- for(; x < input_width; ++x)
+
+ /* Normalize exponentials */
{
- if(is_log)
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- out_ptr[x] = tmp_ptr[x] - sum;
+ auto vec_in = wrapper::vloadq(tmp_ptr + x);
+ auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ if (is_log)
+ {
+ normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ }
+ else
+ {
+ normalized_value =
+ wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ }
+ wrapper::vstore(out_ptr + x, normalized_value);
}
- else
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
{
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ if (is_log)
+ {
+ out_ptr[x] = tmp_ptr[x] - sum;
+ }
+ else
+ {
+ out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ }
}
}
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
index a572891561..40713dc496 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_qasymm8_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<qasymm8_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
index 7d3fe6e046..2c5e284f54 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_qasymm8_signed_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &w
{
return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
index 15a523bfc9..5e94f72faf 100644
--- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
@@ -23,14 +23,20 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_fp16_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -39,6 +45,6 @@ void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<float16_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
index 55c4aee426..d692cc2477 100644
--- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
@@ -23,14 +23,20 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_fp32_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
}
@@ -39,5 +45,5 @@ void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<float>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 2340a31cbd..24f1bb8143 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
+
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
@@ -36,42 +37,48 @@ void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- // Init max value
- auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
+ // Init max value
+ auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto current_value = svld1(pg, in_ptr + x);
- vec_max = svmax_m(pg, vec_max, current_value);
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto current_value = svld1(pg, in_ptr + x);
+ vec_max = svmax_m(pg, vec_max, current_value);
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
- auto max_val = svmaxv(all_true_pg, vec_max);
+ auto max_val = svmaxv(all_true_pg, vec_max);
- *out_ptr = max_val;
- },
- input, output);
+ *out_ptr = max_val;
+ },
+ input, output);
}
template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -82,88 +89,88 @@ void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *co
const auto all_true_pg = wrapper::svptrue<ScalarType>();
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
-
- ScalarType sum{ 0 };
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
- const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
- /* Init sum to zero */
- auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+ ScalarType sum{0};
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
+ /* Compute exponentials and sum */
{
- auto vec_elements = svld1(pg, in_ptr + x);
- vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
- if(!is_log)
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+ const auto vec_max = wrapper::svdup_n(max_val);
+ const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+
+ /* Init sum to zero */
+ auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ do
{
- vec_elements = wrapper::svexp_z(pg, vec_elements);
- vec_sum = svadd_m(pg, vec_sum, vec_elements);
+ auto vec_elements = svld1(pg, in_ptr + x);
+ vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
+ if (!is_log)
+ {
+ vec_elements = wrapper::svexp_z(pg, vec_elements);
+ vec_sum = svadd_m(pg, vec_sum, vec_elements);
+ }
+ svst1(pg, tmp_ptr + x, vec_elements);
+
+ if (is_log)
+ {
+ vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+ }
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ } while (svptest_any(all_true_pg, pg));
+
+ /* Reduce sum */
+ sum = svaddv(all_true_pg, vec_sum);
+
+ if (is_log)
+ {
+ sum = static_cast<ScalarType>(std::log(sum));
}
- svst1(pg, tmp_ptr + x, vec_elements);
-
- if(is_log)
+ else
{
- vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+ sum = ScalarType(1) / sum;
}
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
}
- while(svptest_any(all_true_pg, pg));
- /* Reduce sum */
- sum = svaddv(all_true_pg, vec_sum);
-
- if(is_log)
- {
- sum = static_cast<ScalarType>(std::log(sum));
- }
- else
- {
- sum = ScalarType(1) / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
+ /* Normalize exponentials */
{
- auto vec_in = svld1(pg, tmp_ptr + x);
- auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
- if(is_log)
- {
- normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- else
+ /* Loop over row and compute softmax */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ do
{
- normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- svst1(pg, out_ptr + x, normalized_value);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ auto vec_in = svld1(pg, tmp_ptr + x);
+ auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
+ if (is_log)
+ {
+ normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+ }
+ else
+ {
+ normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+ }
+ svst1(pg, out_ptr + x, normalized_value);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ } while (svptest_any(all_true_pg, pg));
}
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
@@ -171,9 +178,19 @@ template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, cons
template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
+template void sve_softmax_logits_1d_float<float>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
+template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h
index 4f76ec6a26..89a30d042f 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.h
+++ b/src/cpu/kernels/softmax/generic/sve/impl.h
@@ -33,8 +33,13 @@ template <typename ScalarType>
void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
+void sve_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
index e9044d5fc9..85e5ccfea1 100644
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
@@ -33,5 +34,5 @@ void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<qasymm8_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
index ab45ce598d..4be2e2eed6 100644
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
@@ -33,5 +34,5 @@ void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &wi
{
return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
index 8f677c62d4..98b2f5117f 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -23,7 +23,9 @@
*/
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
+
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -31,8 +33,8 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
+void sve2_softmax_logits_1d_quantized(
+ const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -50,162 +52,173 @@ void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi
const int inc_2 = static_cast<int>(2 * svcntw());
const int inc_3 = static_cast<int>(3 * svcntw());
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<float *>(tmp);
- float sum{};
+ float sum{};
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
-
- /* Init sum to zero */
- auto vec_sum_0 = svdup_n_f32(0.f);
- auto vec_sum_1 = svdup_n_f32(0.f);
- auto vec_sum_2 = svdup_n_f32(0.f);
- auto vec_sum_3 = svdup_n_f32(0.f);
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
+ /* Compute exponentials and sum */
{
- const auto vec_elements = svld1(pg, in_ptr + x);
- const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+ const auto vec_max = wrapper::svdup_n(max_val);
+
+ /* Init sum to zero */
+ auto vec_sum_0 = svdup_n_f32(0.f);
+ auto vec_sum_1 = svdup_n_f32(0.f);
+ auto vec_sum_2 = svdup_n_f32(0.f);
+ auto vec_sum_3 = svdup_n_f32(0.f);
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ svbool_t pg_0 = svunpklo(svunpklo(pg));
+ svbool_t pg_1 = svunpkhi(svunpklo(pg));
+ svbool_t pg_2 = svunpklo(svunpkhi(pg));
+ svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+ do
+ {
+ const auto vec_elements = svld1(pg, in_ptr + x);
+ const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
+
+ auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
+ auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
+ auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
+ auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
- auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
- auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
- auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
- auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
+ if (is_log)
+ {
+ vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
+ vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
+ vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
+ vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
+ vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
+ vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
+ vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
+ vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
+ }
+ else
+ {
+ vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
+ vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
+ vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
+ vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
+ vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
+ vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
+ vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
+ vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
+ }
- if(is_log)
+ svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
+ svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
+ svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
+ svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ pg_0 = svunpklo(svunpklo(pg));
+ pg_1 = svunpkhi(svunpklo(pg));
+ pg_2 = svunpklo(svunpkhi(pg));
+ pg_3 = svunpkhi(svunpkhi(pg));
+ } while (svptest_any(all_true_pg, pg));
+
+ /* Reduce sum */
+ const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1),
+ svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
+ sum = svaddv_f32(all_true_pg, vec_sum);
+
+ /* Run remaining elements */
+ x = 0;
+ if (is_log)
{
- vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
- vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
- vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
- vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
+ sum = std::log(sum);
}
else
{
- vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
- vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
- vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
- vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
+ sum = 256.f / sum;
}
-
- svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
- svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
- svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
- svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
}
- while(svptest_any(all_true_pg, pg));
- /* Reduce sum */
- const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
- sum = svaddv_f32(all_true_pg, vec_sum);
-
- /* Run remaining elements */
- x = 0;
- if(is_log)
- {
- sum = std::log(sum);
- }
- else
+ /* Normalize exponentials */
{
- sum = 256.f / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
- {
- auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
- auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
- auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
- auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
-
- svfloat32_t res_0{};
- svfloat32_t res_1{};
- svfloat32_t res_2{};
- svfloat32_t res_3{};
-
- if(is_log)
+ constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
+ /* Loop over row and compute softmax */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ svbool_t pg_0 = svunpklo(svunpklo(pg));
+ svbool_t pg_1 = svunpkhi(svunpklo(pg));
+ svbool_t pg_2 = svunpklo(svunpkhi(pg));
+ svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+ do
{
- res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
- }
- else
- {
- res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+ auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
+ auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
+ auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
+ auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
+
+ svfloat32_t res_0{};
+ svfloat32_t res_1{};
+ svfloat32_t res_2{};
+ svfloat32_t res_3{};
- if(is_qasymm8_signed)
+ if (is_log)
{
- const auto offset_vec = svdup_n_f32(128.f);
- res_0 = svsub_z(pg_0, res_0, offset_vec);
- res_1 = svsub_z(pg_1, res_1, offset_vec);
- res_2 = svsub_z(pg_2, res_2, offset_vec);
- res_3 = svsub_z(pg_3, res_3, offset_vec);
+ res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+ res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+ res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+ res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+ }
+ else
+ {
+ res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+ res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+ res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+ res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = svdup_n_f32(128.f);
+ res_0 = svsub_z(pg_0, res_0, offset_vec);
+ res_1 = svsub_z(pg_1, res_1, offset_vec);
+ res_2 = svsub_z(pg_2, res_2, offset_vec);
+ res_3 = svsub_z(pg_3, res_3, offset_vec);
+ }
}
- }
- // Store value
- const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
- svst1(pg, out_ptr + x, out);
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
+ // Store value
+ const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
+ svst1(pg, out_ptr + x, out);
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ pg_0 = svunpklo(svunpklo(pg));
+ pg_1 = svunpkhi(svunpklo(pg));
+ pg_2 = svunpklo(svunpkhi(pg));
+ pg_3 = svunpkhi(svunpkhi(pg));
+ } while (svptest_any(all_true_pg, pg));
}
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
-template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
-template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
+template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h
index abbcc15181..33fcc26cda 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.h
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.h
@@ -31,8 +31,13 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+void sve2_softmax_logits_1d_quantized(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
index 810035eb9c..95623786b3 100644
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
@@ -23,16 +23,22 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve2_qasymm8_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
index 283b55e9ce..c20462fcef 100644
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
@@ -23,16 +23,22 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve2_qasymm8_signed_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
}
-}
+} // namespace cpu
} // namespace arm_compute