diff options
Diffstat (limited to 'src/cpu/kernels/softmax/generic/neon')
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/fp16.cpp | 12 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/fp32.cpp | 12 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/impl.cpp | 281 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/impl.h | 248 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/qasymm8.cpp | 12 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp | 12 |
6 files changed, 315 insertions, 262 deletions
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index f6556696b0..2e2adf33e0 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/neon/impl.h" @@ -30,8 +31,13 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); } @@ -40,6 +46,6 @@ void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<float16_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index ddd270ae70..61df40c1b5 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<float>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index f07fd2fb27..5d6e6a4f80 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/kernels/softmax/generic/neon/impl.h" + #include "support/SaturateCast.h" namespace arm_compute @@ -32,11 +33,10 @@ template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *o template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) { - static_assert(std::is_same<T, qasymm8_t>::value - || std::is_same<T, qasymm8_signed_t>::value, + static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value, "quantized type should be either qasymm8_t or qasymm8_signed_t."); const int start_x = in->info()->valid_region().anchor.x(); @@ -50,163 +50,174 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi Iterator out_it(out, window); constexpr int vec_size = 16; - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<float *>(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<float *>(tmp); - /* Init sum to zero */ - float32x4x4_t vec_sum = - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + float sum{}; + float sum_inversed{}; - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + + if (is_log) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + vst4q_f32(tmp_ptr + x, vec_elements_flt); } - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } + /* Reduce sum */ + const auto sum_16_byte = + vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); + /* Run remaining elements */ + for (; x < input_width; ++x) + { + float element{}; + if (is_log) + { + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; + } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) + tmp_ptr[x] = element; + } + + if (!is_log) { - element = (max_val - in_ptr[x]) * scale_beta; - sum += std::exp(element); + sum_inversed = 256.f / sum; } else { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; + sum = std::log(sum); } - - tmp_ptr[x] = element; } - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else + /* Normalize exponentials */ { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) + constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - const float32x4x4_t sub = + using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if (is_log) { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + const float32x4x4_t sub = { + vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + }; + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + } + else + { + float32x4x4_t mul = { + vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + }; + + if (is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - float32x4x4_t mul = + if (is_log) { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) + out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); + } + else { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - + (is_qasymm8_signed ? 128.f : 0)); } - - normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0)); } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 206d36a2e0..4d9b789297 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -42,53 +43,65 @@ void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(in, win); Iterator output(out, win); const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto out_ptr = reinterpret_cast<T *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto out_ptr = reinterpret_cast<T *>(output.ptr()); - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); - int x = window_start_x; + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; + } - *out_ptr = max_val; - }, - input, output); + *out_ptr = max_val; + }, + input, output); } template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); template <typename T> -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -103,113 +116,118 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c constexpr int vec_size = 16 / sizeof(T); const int sum_stages = log2(vec_size / 2); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<T *>(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<T *>(tmp); - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); - } - wrapper::vstore(tmp_ptr + x, vec_elements); - } + T sum{}; + T sum_inversed{}; - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) + /* Compute exponentials and sum */ { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); + /* Get max value */ + const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if(is_log) + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if (is_log) + { + vec_elements = + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq( + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(tmp_ptr + x, vec_elements); } - else + + /* Reduce sum */ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for (int i = 0; i < sum_stages; ++i) { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; + sum_res = wrapper::vpadd(sum_res, sum_res); } - tmp_ptr[x] = element; - } + sum = wrapper::vgetlane(sum_res, 0); - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast<T>(std::log(sum)); - } - } + /* Run remaining elements */ + for (; x < input_width; ++x) + { + T element{}; + + if (is_log) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + tmp_ptr[x] = element; + } - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if(is_log) + if (!is_log) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + sum_inversed = T(1) / sum; } else { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + sum = static_cast<T>(std::log(sum)); } - wrapper::vstore(out_ptr + x, normalized_value); } - /* Run remaining elements */ - for(; x < input_width; ++x) + + /* Normalize exponentials */ { - if(is_log) + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - out_ptr[x] = tmp_ptr[x] - sum; + auto vec_in = wrapper::vloadq(tmp_ptr + x); + auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + if (is_log) + { + normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + } + else + { + normalized_value = + wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + if (is_log) + { + out_ptr[x] = tmp_ptr[x] - sum; + } + else + { + out_ptr[x] = tmp_ptr[x] * sum_inversed; + } } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index a572891561..40713dc496 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<qasymm8_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 7d3fe6e046..2c5e284f54 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &w { return neon_logits_1d_max<qasymm8_signed_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute |