about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Pablo Marquez Tello <pablo.tello@arm.com> 2023-08-31 16:00:50 +0100
committer: Pablo Marquez Tello <pablo.tello@arm.com> 2023-09-13 09:16:47 +0000
commit: 7ce8a83de5b58129f24cf28b293cfd8b3e83880c (patch)
tree: 8935223a9b7a959486a1acd4af614eba7e1b0e9c
parent: 145e82e74916a801bd12720564c837c8286042d0 (diff)
download: ComputeLibrary-7ce8a83de5b58129f24cf28b293cfd8b3e83880c.tar.gz
Softmax changes to enable fp16 in armv8a multi_isa builds
* Code guarded with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC needs to be moved to an fp16.cpp file to allow compilation with -march=armv8.2-a+fp16
* fp16.cpp needs to use various templates that had to be moved from impl.cpp to impl.h
* Partially resolves MLCE-1102

Change-Id: I2e5e68fbcf5279de1ffc1be4def4f96ed05593e9
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10224
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- src/cpu/kernels/softmax/generic/neon/impl.cpp 189
-rw-r--r-- src/cpu/kernels/softmax/generic/neon/impl.h 177
2 files changed, 175 insertions, 191 deletions
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index 5654bb52ca..f07fd2fb27 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,68 +22,12 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "support/SaturateCast.h"
namespace arm_compute
{
namespace cpu
{
-template <typename T>
-void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(in, win);
- Iterator output(out, win);
-
- const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
-
- for(int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
-
- *out_ptr = max_val;
- },
- input, output);
-}
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-template void neon_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
-#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-template void neon_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
@@ -264,136 +208,5 @@ template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *
ITensor *out, float beta, bool is_log, const Window &window);
template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
ITensor *out, float beta, bool is_log, const Window &window);
-template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
-{
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
-
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int vec_size = 16 / sizeof(T);
- const int sum_stages = log2(vec_size / 2);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
- T sum{};
- T sum_inversed{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
-
- /* Init sum to zero */
- auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vsub(vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
- }
- else
- {
- vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
- }
- wrapper::vstore(tmp_ptr + x, vec_elements);
- }
-
- /* Reduce sum */
- auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
- for(int i = 0; i < sum_stages; ++i)
- {
- sum_res = wrapper::vpadd(sum_res, sum_res);
- }
- sum = wrapper::vgetlane(sum_res, 0);
-
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- T element{};
-
- if(is_log)
- {
- element = (in_ptr[x] - max_val) * beta;
- sum += std::exp(element);
- }
- else
- {
- element = std::exp((in_ptr[x] - max_val) * beta);
- sum += element;
- }
- tmp_ptr[x] = element;
- }
-
- if(!is_log)
- {
- sum_inversed = T(1) / sum;
- }
- else
- {
- sum = static_cast<T>(std::log(sum));
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
- {
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
- }
- else
- {
- normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = tmp_ptr[x] - sum;
- }
- else
- {
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
- }
- }
- }
- },
- in_it, max_it, out_it);
-}
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-template void neon_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-template void neon_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 6ca659919a..206d36a2e0 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,62 @@
#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
#include "arm_compute/core/Helpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace cpu
{
template <typename T>
-void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
+void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
+{
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ constexpr int window_step_x = 16 / sizeof(T);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win{ window };
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ const int sum_stages = log2(window_step_x / 2);
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output.ptr());
+
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = window_start_x;
+
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+ for(int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+ T max_val = wrapper::vgetlane(carry_max, 0);
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
+ }
+
+ *out_ptr = max_val;
+ },
+ input, output);
+}
template <typename T>
void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
@@ -39,7 +88,129 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi
template <typename T>
void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
+ ITensor *out, const float beta, bool is_log, const Window &window)
+{
+ const int start_x = in->info()->valid_region().anchor.x();
+ const int input_width = in->info()->valid_region().shape.x();
+
+ Iterator in_it(in, window);
+ Iterator max_it(max, window);
+ Iterator out_it(out, window);
+
+ /** SIMD vector tag type. */
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+ constexpr int vec_size = 16 / sizeof(T);
+ const int sum_stages = log2(vec_size / 2);
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<T *>(tmp);
+
+ T sum{};
+ T sum_inversed{};
+
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
+
+ /* Init sum to zero */
+ auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for(; x <= (input_width - vec_size); x += vec_size)
+ {
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vsub(vec_elements, vec_max);
+ if(is_log)
+ {
+ vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
+ vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+ }
+ else
+ {
+ vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
+ vec_sum = wrapper::vadd(vec_sum, vec_elements);
+ }
+ wrapper::vstore(tmp_ptr + x, vec_elements);
+ }
+
+ /* Reduce sum */
+ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
+ for(int i = 0; i < sum_stages; ++i)
+ {
+ sum_res = wrapper::vpadd(sum_res, sum_res);
+ }
+ sum = wrapper::vgetlane(sum_res, 0);
+
+ /* Run remaining elements */
+ for(; x < input_width; ++x)
+ {
+ T element{};
+
+ if(is_log)
+ {
+ element = (in_ptr[x] - max_val) * beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((in_ptr[x] - max_val) * beta);
+ sum += element;
+ }
+ tmp_ptr[x] = element;
+ }
+
+ if(!is_log)
+ {
+ sum_inversed = T(1) / sum;
+ }
+ else
+ {
+ sum = static_cast<T>(std::log(sum));
+ }
+ }
+
+ /* Normalize exponentials */
+ {
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for(; x <= (input_width - vec_size); x += vec_size)
+ {
+ auto vec_in = wrapper::vloadq(tmp_ptr + x);
+ auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ if(is_log)
+ {
+ normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ }
+ else
+ {
+ normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ }
+ wrapper::vstore(out_ptr + x, normalized_value);
+ }
+ /* Run remaining elements */
+ for(; x < input_width; ++x)
+ {
+ if(is_log)
+ {
+ out_ptr[x] = tmp_ptr[x] - sum;
+ }
+ else
+ {
+ out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ }
+ }
+ }
+ },
+ in_it, max_it, out_it);
+}
} // namespace cpu
} // namespace arm_compute