Diffstat (limited to 'src/cpu/kernels/softmax/generic')
 src/cpu/kernels/softmax/generic/neon/fp16.cpp            |  12
 src/cpu/kernels/softmax/generic/neon/fp32.cpp            |  12
 src/cpu/kernels/softmax/generic/neon/impl.cpp            | 281
 src/cpu/kernels/softmax/generic/neon/impl.h              | 248
 src/cpu/kernels/softmax/generic/neon/qasymm8.cpp         |  12
 src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp  |  12
 src/cpu/kernels/softmax/generic/sve/fp16.cpp             |  12
 src/cpu/kernels/softmax/generic/sve/fp32.cpp             |  12
 src/cpu/kernels/softmax/generic/sve/impl.cpp             | 211
 src/cpu/kernels/softmax/generic/sve/impl.h               |   9
 src/cpu/kernels/softmax/generic/sve/qasymm8.cpp          |   3
 src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp   |   3
 src/cpu/kernels/softmax/generic/sve2/impl.cpp            | 289
 src/cpu/kernels/softmax/generic/sve2/impl.h              |   9
 src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp         |  12
 src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp  |  12
 16 files changed, 634 insertions(+), 515 deletions(-)
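The NEON and SVE float kernels in these files (neon_softmax_logits_1d_float, sve_softmax_logits_1d_float) vectorize the same per-row computation: subtract the row maximum, scale by beta, exponentiate while accumulating the sum, then either divide by the sum or, for log-softmax, subtract log(sum). A minimal scalar sketch of that computation follows, based on the leftover-element loops visible in the diff below; the name softmax_row_reference is illustrative and not part of the library.

#include <algorithm>
#include <cmath>
#include <vector>

// Scalar reference for one row; the vector loops in impl.h / impl.cpp below
// compute the same thing 16 bytes (NEON) or one SVE vector at a time.
template <typename T>
void softmax_row_reference(const T *in, T *out, int width, T beta, bool is_log)
{
    // Row maximum, produced by the *_logits_1d_max kernels.
    const T max_val = *std::max_element(in, in + width);

    // Exponentials (or scaled differences for log-softmax) and their sum.
    std::vector<T> tmp(static_cast<size_t>(width));
    T sum{};
    for (int x = 0; x < width; ++x)
    {
        const T shifted = (in[x] - max_val) * beta;
        tmp[x] = is_log ? shifted : std::exp(shifted);
        sum += is_log ? std::exp(shifted) : tmp[x];
    }

    // Normalization: divide by the sum, or subtract log(sum) for log-softmax.
    const T log_sum      = static_cast<T>(std::log(sum));
    const T sum_inversed = T(1) / sum;
    for (int x = 0; x < width; ++x)
    {
        out[x] = is_log ? (tmp[x] - log_sum) : (tmp[x] * sum_inversed);
    }
}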
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index f6556696b0..2e2adf33e0 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/neon/impl.h" @@ -30,8 +31,13 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); } @@ -40,6 +46,6 @@ void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<float16_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index ddd270ae70..61df40c1b5 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<float>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index f07fd2fb27..5d6e6a4f80 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "src/cpu/kernels/softmax/generic/neon/impl.h" + #include "support/SaturateCast.h" namespace arm_compute @@ -32,11 +33,10 @@ template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *o template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) { - static_assert(std::is_same<T, qasymm8_t>::value - || std::is_same<T, qasymm8_signed_t>::value, + static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value, "quantized type should be either qasymm8_t or qasymm8_signed_t."); const int start_x = in->info()->valid_region().anchor.x(); @@ -50,163 +50,174 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi Iterator out_it(out, window); constexpr int vec_size = 16; - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<float *>(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<float *>(tmp); - /* Init sum to zero */ - float32x4x4_t vec_sum = - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + float sum{}; + float sum_inversed{}; - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x 
<= (input_width - vec_size); x += vec_size) { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + + if (is_log) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + vst4q_f32(tmp_ptr + x, vec_elements_flt); } - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } + /* Reduce sum */ + const auto sum_16_byte = + vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); + /* Run remaining elements */ + for (; x < input_width; ++x) + { + float element{}; + if (is_log) + { + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; + } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) + tmp_ptr[x] = element; + } + + if (!is_log) { - element = (max_val - in_ptr[x]) * scale_beta; - sum += std::exp(element); + sum_inversed = 256.f / sum; } else { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; + sum = std::log(sum); 
} - - tmp_ptr[x] = element; } - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else + /* Normalize exponentials */ { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) + constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - const float32x4x4_t sub = + using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if (is_log) { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + const float32x4x4_t sub = { + vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + }; + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + } + else + { + float32x4x4_t mul = { + vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + }; + + if (is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - float32x4x4_t mul = + if (is_log) { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) + out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); + } + else { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - + (is_qasymm8_signed ? 128.f : 0)); } - - normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 
128.f : 0)); } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 206d36a2e0..4d9b789297 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -42,53 +43,65 @@ void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(in, win); Iterator output(out, win); const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto out_ptr = reinterpret_cast<T *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto out_ptr = reinterpret_cast<T *>(output.ptr()); - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); - int x = window_start_x; + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + max_val = *(in_ptr + x) > max_val ? 
*(in_ptr + x) : max_val; + } - *out_ptr = max_val; - }, - input, output); + *out_ptr = max_val; + }, + input, output); } template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); template <typename T> -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -103,113 +116,118 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c constexpr int vec_size = 16 / sizeof(T); const int sum_stages = log2(vec_size / 2); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<T *>(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<T *>(tmp); - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); - } - wrapper::vstore(tmp_ptr + x, vec_elements); - } + T sum{}; + T sum_inversed{}; - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) + /* Compute exponentials and sum */ { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); + /* Get max value */ + const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if(is_log) + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); + auto vec_elements = wrapper::vloadq(in_ptr + x); + 
vec_elements = wrapper::vsub(vec_elements, vec_max); + if (is_log) + { + vec_elements = + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq( + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(tmp_ptr + x, vec_elements); } - else + + /* Reduce sum */ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for (int i = 0; i < sum_stages; ++i) { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; + sum_res = wrapper::vpadd(sum_res, sum_res); } - tmp_ptr[x] = element; - } + sum = wrapper::vgetlane(sum_res, 0); - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast<T>(std::log(sum)); - } - } + /* Run remaining elements */ + for (; x < input_width; ++x) + { + T element{}; + + if (is_log) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + tmp_ptr[x] = element; + } - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if(is_log) + if (!is_log) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + sum_inversed = T(1) / sum; } else { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + sum = static_cast<T>(std::log(sum)); } - wrapper::vstore(out_ptr + x, normalized_value); } - /* Run remaining elements */ - for(; x < input_width; ++x) + + /* Normalize exponentials */ { - if(is_log) + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - out_ptr[x] = tmp_ptr[x] - sum; + auto vec_in = wrapper::vloadq(tmp_ptr + x); + auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + if (is_log) + { + normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + } + else + { + normalized_value = + wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + if (is_log) + { + out_ptr[x] = tmp_ptr[x] - sum; + } + else + { + out_ptr[x] = tmp_ptr[x] * sum_inversed; + } } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index a572891561..40713dc496 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -22,14 +22,20 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<qasymm8_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 7d3fe6e046..2c5e284f54 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &w { return neon_logits_1d_max<qasymm8_signed_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp index 15a523bfc9..5e94f72faf 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp @@ -23,14 +23,20 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); } @@ -39,6 +45,6 @@ void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max<float16_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp index 55c4aee426..d692cc2477 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp @@ -23,14 +23,20 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor 
*out, const float beta, bool is_log, const Window &window) +void sve_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); } @@ -39,5 +45,5 @@ void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max<float>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 2340a31cbd..24f1bb8143 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/softmax/generic/sve/impl.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute @@ -36,42 +37,48 @@ void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); - const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>()); + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>()); - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); - auto max_val = svmaxv(all_true_pg, vec_max); + auto max_val = svmaxv(all_true_pg, vec_max); - *out_ptr = max_val; - }, - input, output); + *out_ptr = max_val; + }, + input, output); } template <typename ScalarType> -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -82,88 +89,88 @@ void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *co const auto all_true_pg = wrapper::svptrue<ScalarType>(); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + 
start_x; - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta)); + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp); - /* Init sum to zero */ - auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0)); + ScalarType sum{0}; - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - do + /* Compute exponentials and sum */ { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); - if(!is_log) + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta)); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do { - vec_elements = wrapper::svexp_z(pg, vec_elements); - vec_sum = svadd_m(pg, vec_sum, vec_elements); + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); + if (!is_log) + { + vec_elements = wrapper::svexp_z(pg, vec_elements); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + if (is_log) + { + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if (is_log) + { + sum = static_cast<ScalarType>(std::log(sum)); } - svst1(pg, tmp_ptr + x, vec_elements); - - if(is_log) + else { - vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + sum = ScalarType(1) / sum; } - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); } - while(svptest_any(all_true_pg, pg)); - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = static_cast<ScalarType>(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - do + /* Normalize exponentials */ { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); - } - else + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += 
wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0)); + if (is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } while (svptest_any(all_true_pg, pg)); } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window); @@ -171,9 +178,19 @@ template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, cons template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); -template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +template void sve_softmax_logits_1d_float<float>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); +template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h index 4f76ec6a26..89a30d042f 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.h +++ b/src/cpu/kernels/softmax/generic/sve/impl.h @@ -33,8 +33,13 @@ template <typename ScalarType> void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template <typename ScalarType> -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp index e9044d5fc9..85e5ccfea1 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute @@ -33,5 +34,5 @@ void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max<qasymm8_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp index ab45ce598d..4be2e2eed6 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ 
#include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute @@ -33,5 +34,5 @@ void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &wi { return sve_logits_1d_max<qasymm8_signed_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 8f677c62d4..98b2f5117f 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -23,7 +23,9 @@ */ #include "src/cpu/kernels/softmax/generic/sve2/impl.h" + #include "arm_compute/core/Types.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -31,8 +33,8 @@ namespace arm_compute namespace cpu { template <typename ScalarType> -void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void sve2_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -50,162 +52,173 @@ void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi const int inc_2 = static_cast<int>(2 * svcntw()); const int inc_3 = static_cast<int>(3 * svcntw()); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<float *>(tmp); + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<float *>(tmp); - float sum{}; + float sum{}; - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum_0 = svdup_n_f32(0.f); - auto vec_sum_1 = svdup_n_f32(0.f); - auto vec_sum_2 = svdup_n_f32(0.f); - auto vec_sum_3 = svdup_n_f32(0.f); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do + /* Compute exponentials and sum */ { - const auto vec_elements = svld1(pg, in_ptr + x); - const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum_0 = svdup_n_f32(0.f); + auto vec_sum_1 = svdup_n_f32(0.f); + auto vec_sum_2 = svdup_n_f32(0.f); + auto vec_sum_3 = svdup_n_f32(0.f); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + 
svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + const auto vec_elements = svld1(pg, in_ptr + x); + const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + + auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); + auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); + auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); + auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); - auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); - auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); - auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); - auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); + if (is_log) + { + vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); + vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); + vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); + vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + } + else + { + vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); + vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); + vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); + vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + } - if(is_log) + svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); + svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); + svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); + svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), + svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); + sum = svaddv_f32(all_true_pg, vec_sum); + + /* Run remaining elements */ + x = 0; + if (is_log) { - vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); - vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); - vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); - vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); - vec_sum_3 = 
svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + sum = std::log(sum); } else { - vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); - vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); - vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); - vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + sum = 256.f / sum; } - - svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); - svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); - svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); - svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); } - while(svptest_any(all_true_pg, pg)); - /* Reduce sum */ - const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); - sum = svaddv_f32(all_true_pg, vec_sum); - - /* Run remaining elements */ - x = 0; - if(is_log) - { - sum = std::log(sum); - } - else + /* Normalize exponentials */ { - sum = 256.f / sum; - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value; - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); - auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); - auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); - auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); - - svfloat32_t res_0{}; - svfloat32_t res_1{}; - svfloat32_t res_2{}; - svfloat32_t res_3{}; - - if(is_log) + constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value; + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do { - res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - } - else - { - res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); + auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); + auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); + auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); + + svfloat32_t res_0{}; + svfloat32_t res_1{}; + svfloat32_t res_2{}; + svfloat32_t res_3{}; - 
if(is_qasymm8_signed) + if (is_log) { - const auto offset_vec = svdup_n_f32(128.f); - res_0 = svsub_z(pg_0, res_0, offset_vec); - res_1 = svsub_z(pg_1, res_1, offset_vec); - res_2 = svsub_z(pg_2, res_2, offset_vec); - res_3 = svsub_z(pg_3, res_3, offset_vec); + res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + } + else + { + res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + + if (is_qasymm8_signed) + { + const auto offset_vec = svdup_n_f32(128.f); + res_0 = svsub_z(pg_0, res_0, offset_vec); + res_1 = svsub_z(pg_1, res_1, offset_vec); + res_2 = svsub_z(pg_2, res_2, offset_vec); + res_3 = svsub_z(pg_3, res_3, offset_vec); + } } - } - // Store value - const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3); - svst1(pg, out_ptr + x, out); - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); + // Store value + const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3); + svst1(pg, out_ptr + x, out); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h index abbcc15181..33fcc26cda 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.h +++ b/src/cpu/kernels/softmax/generic/sve2/impl.h @@ -31,8 +31,13 @@ namespace arm_compute namespace cpu { template <typename ScalarType> -void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */ diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp index 810035eb9c..95623786b3 100644 --- 
a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp @@ -23,16 +23,22 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve2/impl.h" namespace arm_compute { namespace cpu { -void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve2_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp index 283b55e9ce..c20462fcef 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp @@ -23,16 +23,22 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve2/impl.h" namespace arm_compute { namespace cpu { -void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve2_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); } -} +} // namespace cpu } // namespace arm_compute |
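The quantized kernels above (neon_softmax_logits_1d_quantized, sve2_softmax_logits_1d_quantized) share the normalization step shown in their leftover-element loops: the float values staged in the tmp buffer are rescaled by 256 / sum (or shifted by log(sum) in the log case), offset by 128 for the signed variant, and saturate-cast back to the 8-bit type. A scalar sketch under those assumptions follows; quantized_normalize_reference is an illustrative name, and the clamp stands in for utils::cast::saturate_cast.

#include <algorithm>
#include <cmath>
#include <limits>

// T is the quantized output type: uint8_t for QASYMM8, int8_t for QASYMM8_SIGNED.
template <typename T>
void quantized_normalize_reference(const float *tmp, T *out, int width, float sum, bool is_log)
{
    const bool  is_signed    = std::numeric_limits<T>::is_signed;
    const float sum_inversed = 256.f / sum;   // maps the row onto the 8-bit output range
    const float log_sum      = std::log(sum); // log-softmax path
    const float lo = static_cast<float>(std::numeric_limits<T>::lowest());
    const float hi = static_cast<float>(std::numeric_limits<T>::max());

    for (int x = 0; x < width; ++x)
    {
        const float value = is_log ? (tmp[x] - log_sum)
                                   : (tmp[x] * sum_inversed - (is_signed ? 128.f : 0.f));
        // Stand-in for utils::cast::saturate_cast<T>: clamp to the representable range.
        out[x] = static_cast<T>(std::clamp(value, lo, hi));
    }
}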