diff options
Diffstat (limited to 'src/cpu/kernels/softmax/generic')
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/fp16.cpp | 21 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/fp32.cpp | 23 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/impl.cpp | 152 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/impl.h | 210 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/qasymm8.cpp | 22 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp | 23 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/sve/fp16.cpp | 50 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/sve/fp32.cpp | 49 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/sve/impl.cpp | 25 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/sve/qasymm8.cpp | 38 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp | 38 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/sve2/impl.cpp | 20 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp | 44 | ||||
-rw-r--r-- | src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp | 44 |
14 files changed, 248 insertions, 511 deletions
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index 2e2adf33e0..db8f881712 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -31,21 +31,18 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); -} -void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) +template <bool IS_LOG> +void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max<float16_t>(in, out, window); + return neon_softmax_float<float16_t, IS_LOG>(in, tmp, out, beta, window); } + +template void +neon_fp16_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp16_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index 61df40c1b5..c281d1bf31 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); -} -void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) +template <bool IS_LOG> +void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max<float>(in, out, window); + return neon_softmax_float<float, IS_LOG>(in, tmp, out, beta, window); } + +template void +neon_fp32_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp32_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index 5d6e6a4f80..487f6ae051 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -29,43 +29,76 @@ namespace arm_compute { namespace cpu { -template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); -template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); - -template <typename T> -void neon_softmax_logits_1d_quantized( - const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) +template <typename T, bool IS_LOG> +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value, "quantized type should be either qasymm8_t or qasymm8_signed_t."); - const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator out_it(out, window); - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); constexpr int vec_size = 16; +#ifndef __aarch64__ + const int sum_stages = log2(vec_size >> 1); +#endif // __aarch64__ + + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<float *>(tmp); + const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr()); + T *out_ptr = reinterpret_cast<T *>(out_it.ptr()); + float *tmp_ptr = reinterpret_cast<float *>(tmp); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = 0; - float sum{}; - float sum_inversed{}; + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ + + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // Compute Max + + float sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); /* Init sum to zero */ @@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized( int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); - if (is_log) + if (IS_LOG) { vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); @@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized( } /* Reduce sum */ - const auto sum_16_byte = + const float32x4_t sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + + float sum; + +#ifdef __aarch64__ + sum = wrapper::vaddv(sum_16_byte); +#else // __aarch64__ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); sum_res = vpadd_f32(sum_res, sum_res); sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { float element{}; - if (is_log) + if (IS_LOG) { element = (max_val - in_ptr[x]) * scale_beta; sum += std::exp(element); @@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized( tmp_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = 256.f / sum; + sum_transformed = 256.f / sum; } else { - sum = std::log(sum); + sum_transformed = std::log(sum); } - } + } // Compute exponentials and sum /* Normalize exponentials */ { constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + + const float32x4_t sum_vec = vdupq_n_f32(sum_transformed); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) @@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized( using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); int_vec_type normalized_value{}; - if (is_log) + if (IS_LOG) { const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[0], sum_vec), + vsubq_f32(vec_in.val[1], sum_vec), + vsubq_f32(vec_in.val[2], sum_vec), + vsubq_f32(vec_in.val[3], sum_vec), }; normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); } else { float32x4x4_t mul = { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[0], sum_vec), + vmulq_f32(vec_in.val[1], sum_vec), + vmulq_f32(vec_in.val[2], sum_vec), + vmulq_f32(vec_in.val[3], sum_vec), }; if (is_qasymm8_signed) @@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized( /* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); + out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed); } else { - out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - + out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) - (is_qasymm8_signed ? 128.f : 0)); } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } -template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); +template void neon_softmax_quantized<qasymm8_signed_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_signed_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 4d9b789297..60380cd233 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H #include "arm_compute/core/Helpers.h" @@ -33,105 +33,100 @@ namespace arm_compute { namespace cpu { -template <typename T> -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - Window win{window}; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop( - win, - [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto out_ptr = reinterpret_cast<T *>(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); - int x = window_start_x; - - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for (int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for (; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } +#ifdef __aarch64__ +namespace +{ +// These helper functions are added because vaddv does not exist for fp16, +// and, therefore, is not part of the wrapper::vaddv interface. +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages) +{ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a)); + for (int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + return wrapper::vgetlane(sum_res, 0); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - *out_ptr = max_val; - }, - input, output); +inline float wrapper_vaddv(const float32x4_t &a, int sum_stages) +{ + ARM_COMPUTE_UNUSED(sum_stages); + return wrapper::vaddv(a); } +} // namespace +#endif // __aarch64__ -template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); - -template <typename T> -void neon_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +// The template implementation for float data types is stored in the header file because +// we need all fp16 instantiated code to live in fp16.cpp files. +template <typename T, bool IS_LOG> +void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { - const int start_x = in->info()->valid_region().anchor.x(); + ARM_COMPUTE_UNUSED(tmp); + const int input_width = in->info()->valid_region().shape.x(); Iterator in_it(in, window); - Iterator max_it(max, window); Iterator out_it(out, window); /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); + constexpr int vec_size = 16 / sizeof(T); + + const int sum_stages = log2(vec_size >> 1); + + const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}); execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<T *>(tmp); + const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr()); + T *out_ptr = reinterpret_cast<T *>(out_it.ptr()); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = 0; + + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ - T sum{}; - T sum_inversed{}; + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // compute max + + T sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); /* Init sum to zero */ @@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in, { auto vec_elements = wrapper::vloadq(in_ptr + x); vec_elements = wrapper::vsub(vec_elements, vec_max); - if (is_log) + if (IS_LOG) { - vec_elements = - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + vec_elements = wrapper::vmul(vec_elements, beta_vec); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); } else { - vec_elements = wrapper::vexpq( - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec)); + vec_sum = wrapper::vadd(vec_sum, vec_elements); } - wrapper::vstore(tmp_ptr + x, vec_elements); + wrapper::vstore(out_ptr + x, vec_elements); } /* Reduce sum */ + T sum{}; +#ifdef __aarch64__ + sum = wrapper_vaddv(vec_sum, sum_stages); +#else // __aarch64__ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); for (int i = 0; i < sum_stages; ++i) { sum_res = wrapper::vpadd(sum_res, sum_res); } sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { T element{}; - if (is_log) + if (IS_LOG) { element = (in_ptr[x] - max_val) * beta; sum += std::exp(element); @@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in, element = std::exp((in_ptr[x] - max_val) * beta); sum += element; } - tmp_ptr[x] = element; + + out_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = T(1) / sum; + sum_transformed = T(1) / sum; } else { - sum = static_cast<T>(std::log(sum)); + sum_transformed = static_cast<T>(std::log(sum)); } - } + } // Compute exponentials and sum /* Normalize exponentials */ { + const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{}); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if (is_log) + const auto vec_in = wrapper::vloadq(out_ptr + x); + if (IS_LOG) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec)); } else { - normalized_value = - wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec)); } - wrapper::vstore(out_ptr + x, normalized_value); } + /* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = tmp_ptr[x] - sum; + out_ptr[x] = out_ptr[x] - sum_transformed; } else { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + out_ptr[x] = out_ptr[x] * sum_transformed; } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } + +template <typename T, bool IS_LOG> +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index 40713dc496..9589ebcd7c 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,20 +29,16 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template <bool IS_LOG> +void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, window); } -void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max<qasymm8_t>(in, out, window); -} +template void +neon_qasymm8_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_qasymm8_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 2c5e284f54..0bf6b2859a 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template <bool IS_LOG> +void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, window); } -void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max<qasymm8_signed_t>(in, out, window); -} +template void neon_qasymm8_signed_softmax<true>( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void neon_qasymm8_signed_softmax<false>( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp deleted file mode 100644 index 5e94f72faf..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2021-2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/CpuTypes.h" -#include "src/cpu/kernels/softmax/generic/sve/impl.h" -namespace arm_compute -{ -namespace cpu -{ -void sve_fp16_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); -} - -void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<float16_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp deleted file mode 100644 index d692cc2477..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); -} - -void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<float>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 24f1bb8143..0d4b7f4509 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to +/// a single kernel that performs softmax operation. Leaving the SVE code here for +/// future references. Implementation for Neon(TM) is introduced in COMPMID-6500 template <typename ScalarType> void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) { @@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in, }, in_it, max_it, out_it); } - -template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); - -template void sve_softmax_logits_1d_float<float>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); -template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp deleted file mode 100644 index 85e5ccfea1..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<qasymm8_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp deleted file mode 100644 index 4be2e2eed6..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<qasymm8_signed_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 98b2f5117f..a8fb1d4adf 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,6 +32,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to +/// a single kernel that performs softmax operation. Leaving the SVE2 code here for +/// future references. Implementation for Neon(TM) is introduced in COMPMID-6500 template <typename ScalarType> void sve2_softmax_logits_1d_quantized( const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) @@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized( }, in_it, max_it, out_it); } - -template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp deleted file mode 100644 index 95623786b3..0000000000 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve2_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp deleted file mode 100644 index c20462fcef..0000000000 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve2_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); -} -} // namespace cpu -} // namespace arm_compute |