about | summary | refs | log | tree | commit | diff
path: root/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
diff options
context:
space:
mode:
author: Michalis Spyrou <michalis.spyrou@arm.com> 2021-01-06 17:40:30 +0000
committer: Michalis Spyrou <michalis.spyrou@arm.com> 2021-01-18 14:03:46 +0000
commit: b5a450a1acc1149f99f7bb06b10694fba554f4e3 (patch)
tree: ac6ddf556c1656fd697a53b6ad5e961938f7d85d /src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
parent: d2447bb039c268aa21a5ca358cc2d91abe4f4d21 (diff)
download: ComputeLibrary-b5a450a1acc1149f99f7bb06b10694fba554f4e3.tar.gz
Add SVE for Softmax
Implements COMPMID-3875

Change-Id: I38991eed3f4966db125862af066bfedff5994a25
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4854
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NESoftmaxLayerKernel.cpp')
-rw-r--r-- src/core/NEON/kernels/NESoftmaxLayerKernel.cpp 576
1 files changed, 149 insertions, 427 deletions
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 97797cefde..fe09f1ec59 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,61 +27,169 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "support/SaturateCast.h"
-#include <algorithm>
-#include <arm_neon.h>
-#include <cfloat>
-#include <functional>
+#include "src/core/NEON/kernels/softmax/impl/NEON/list.h"
+#include "src/core/NEON/kernels/softmax/impl/SVE/list.h"
+#include "src/core/common/Registrars.h"
namespace arm_compute
{
-template <typename float_vec_type, typename int_vec_type>
-int_vec_type convert_float_to_int(const float_vec_type &in);
+namespace
+{
+struct SoftmaxSelectorData
+{
+ DataType dt;
+};
+using SoftmaxSelectorPtr = std::add_pointer<bool(const SoftmaxSelectorData &data)>::type;
+using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
-template <typename float_vec_type, typename int_vec_type>
-float_vec_type convert_int_to_float(const int_vec_type &in);
+struct SoftmaxLogits1DKernel
+{
+ const char *name;
+ const SoftmaxSelectorPtr is_selected;
+ SoftmaxLogits1DKernelPtr ukernel;
+};
-template <>
-uint8x16_t convert_float_to_int<float32x4x4_t, uint8x16_t>(const float32x4x4_t &in)
+struct SoftmaxLogits1DMaxKernel
{
- uint8x16_t out;
- convert_float32x4x4_to_uint8x16(in, out);
- return out;
-}
+ const char *name;
+ const SoftmaxSelectorPtr is_selected;
+ SoftmaxLogits1DMaxKernelPtr ukernel;
+};
-template <>
-int8x16_t convert_float_to_int<float32x4x4_t, int8x16_t>(const float32x4x4_t &in)
+static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
{
- int8x16_t out;
- convert_float32x4x4_to_int8x16(in, out);
- return out;
-}
+#if defined(__ARM_FEATURE_SVE)
+ {
+ "sve_softmax_logits_1d_float",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float>)
+ },
+ {
+ "sve_softmax_logits_1d_float",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
+ },
+#else /* !defined(__ARM_FEATURE_SVE) */
+ {
+ "neon_softmax_logits_1d_float",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float>)
+ },
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ {
+ "neon_softmax_logits_1d_float",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
+ },
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#endif /* defined(__ARM_FEATURE_SVE) */
-template <>
-float32x4x4_t convert_int_to_float<float32x4x4_t, uint8x16_t>(const uint8x16_t &in)
+#if defined(__ARM_FEATURE_SVE2)
+ {
+ "sve_softmax_logits_1d_quantized",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_t>)
+ },
+ {
+ "sve_softmax_logits_1d_quantized",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_signed_t>)
+ },
+#else /* !defined(__ARM_FEATURE_SVE2) */
+ {
+ "neon_softmax_logits_1d_quantized",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_t>)
+ },
+ {
+ "neon_softmax_logits_1d_quantized",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_signed_t>)
+ },
+#endif /* defined(__ARM_FEATURE_SVE2) */
+
+};
+
+static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
{
- return convert_uint8x16_to_float32x4x4(in);
-}
+#if defined(__ARM_FEATURE_SVE)
+ {
+ "sve_logits_1d_max",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max<float>)
+ },
+ {
+ "sve_logits_1d_max",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max<float16_t>)
+ },
+ {
+ "sve_logits_1d_max",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_t>)
+ },
+ {
+ "sve_logits_1d_max",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
+ },
+#else /* !defined(__ARM_FEATURE_SVE) */
+ {
+ "neon_logits_1d_max",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max<float>)
+ },
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ {
+ "neon_logits_1d_max",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max<float16_t>)
+ },
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+ {
+ "neon_logits_1d_max",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_t>)
+ },
+ {
+ "neon_logits_1d_max",
+ [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
+ },
+#endif /* defined(__ARM_FEATURE_SVE) */
+};
-template <>
-float32x4x4_t convert_int_to_float<float32x4x4_t, int8x16_t>(const int8x16_t &in)
+const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
{
- return convert_int8x16_to_float32x4x4(in);
+ for(const auto &uk : available_logits_1d_kernels)
+ {
+ if(uk.is_selected({ data.dt }))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
}
-namespace
+const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data)
{
+ for(const auto &uk : available_logits_1d_max_kernels)
+ {
+ if(uk.is_selected({ data.dt }))
+ {
+ return &uk;
+ }
+ }
+ return nullptr;
+}
+
Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
@@ -98,59 +206,10 @@ Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorI
return Status{};
}
-template <typename T>
-void logits_1d_max(const ITensor &in, ITensor &out, const Window &window)
-{
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(&in, win);
- Iterator output(&out, win);
-
- const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
-
- for(int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
-
- *out_ptr = max_val;
- },
- input, output);
-}
} // namespace
NELogits1DMaxKernel::NELogits1DMaxKernel()
- : _func(nullptr), _border_size()
+ : _border_size()
{
}
@@ -177,26 +236,6 @@ void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
coord.set_num_dimensions(output->info()->num_dimensions());
output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- _func = &logits_1d_max<qasymm8_t>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &logits_1d_max<qasymm8_signed_t>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &logits_1d_max<float16_t>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- _func = &logits_1d_max<float>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-
_input = input;
_output = output;
@@ -222,9 +261,9 @@ void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(*_input, *_output, window);
+ const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ _input->info()->data_type() });
+ uk->ukernel(_input, _output, window);
}
namespace
@@ -265,308 +304,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor
return Status{};
}
-template <typename T, bool is_log>
-void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta, const Window &window)
-{
- static_assert(std::is_same<T, qasymm8_t>::value
- || std::is_same<T, qasymm8_signed_t>::value,
- "quantized type should be either qasymm8_t or qasymm8_signed_t.");
-
- const int start_x = in.info()->valid_region().anchor.x();
- const int input_width = in.info()->valid_region().shape.x();
-
- const float scale_beta = -beta * in.info()->quantization_info().uniform().scale;
- const auto scale_beta_vec = vdupq_n_f32(scale_beta);
-
- Iterator in_it(&in, window);
- Iterator max_it(&max, window);
- Iterator out_it(&out, window);
- constexpr int vec_size = 16;
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
- float sum{};
- float sum_inversed{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
-
- /* Init sum to zero */
- float32x4x4_t vec_sum =
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- };
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vqsub(vec_max, vec_elements);
- auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
-
- if(is_log)
- {
- vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
- vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
- vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
- vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
- }
- else
- {
- vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
- vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
- vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
- vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
- }
-
- vst4q_f32(tmp_ptr + x, vec_elements_flt);
- }
-
- /* Reduce sum */
- const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
- auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
- sum_res = vpadd_f32(sum_res, sum_res);
- sum = wrapper::vgetlane(sum_res, 0);
-
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- float element{};
- if(is_log)
- {
- element = (max_val - in_ptr[x]) * scale_beta;
- sum += std::exp(element);
- }
- else
- {
- element = std::exp((max_val - in_ptr[x]) * scale_beta);
- sum += element;
- }
-
- tmp_ptr[x] = element;
- }
-
- if(!is_log)
- {
- sum_inversed = 256.f / sum;
- }
- else
- {
- sum = std::log(sum);
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
- float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
- int_vec_type normalized_value{};
- if(is_log)
- {
- const float32x4x4_t sub =
- {
- vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
- };
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
- }
- else
- {
- float32x4x4_t mul =
- {
- vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
- };
-
- if(is_qasymm8_signed)
- {
- const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
- mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
- mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
- mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
- mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
- }
-
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
- }
- else
- {
- out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
- }
- }
- }
- },
- in_it, max_it, out_it);
-}
-
-template <typename T, bool is_log = false>
-void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp,
- ITensor &out, const float beta, const Window &window)
-{
- const int start_x = in.info()->valid_region().anchor.x();
- const int input_width = in.info()->valid_region().shape.x();
-
- Iterator in_it(&in, window);
- Iterator max_it(&max, window);
- Iterator out_it(&out, window);
-
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int vec_size = 16 / sizeof(T);
- const int sum_stages = log2(vec_size / 2);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
- T sum{};
- T sum_inversed{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
-
- /* Init sum to zero */
- auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vsub(vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
- }
- else
- {
- vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
- }
- wrapper::vstore(tmp_ptr + x, vec_elements);
- }
-
- /* Reduce sum */
- auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
- for(int i = 0; i < sum_stages; ++i)
- {
- sum_res = wrapper::vpadd(sum_res, sum_res);
- }
- sum = wrapper::vgetlane(sum_res, 0);
-
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- T element{};
-
- if(is_log)
- {
- element = (in_ptr[x] - max_val) * beta;
- sum += std::exp(element);
- }
- else
- {
- element = std::exp((in_ptr[x] - max_val) * beta);
- sum += element;
- }
- tmp_ptr[x] = element;
- }
-
- if(!is_log)
- {
- sum_inversed = T(1) / sum;
- }
- else
- {
- sum = static_cast<T>(std::log(sum));
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
- {
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
- }
- else
- {
- normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = tmp_ptr[x] - sum;
- }
- else
- {
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
- }
- }
- }
- },
- in_it, max_it, out_it);
-}
} // namespace
template <bool IS_LOG>
NELogits1DSoftmaxKernel<IS_LOG>::NELogits1DSoftmaxKernel()
- : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _beta(1.0f), _tmp(nullptr)
+ : _input(nullptr), _max(nullptr), _output(nullptr), _beta(1.0f), _tmp(nullptr)
{
}
@@ -595,27 +337,6 @@ void NELogits1DSoftmaxKernel<IS_LOG>::configure(const ITensor *input, const ITen
coord.set_num_dimensions(output->info()->num_dimensions());
output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- _func = &logits_1d_softmax_qasymm8<qasymm8_t, IS_LOG>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &logits_1d_softmax_qasymm8<qasymm8_signed_t, IS_LOG>;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &logits_1d_softmax_float<float16_t, IS_LOG>;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- _func = &logits_1d_softmax_float<float, IS_LOG>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- break;
- }
-
_input = input;
_max = max;
_output = output;
@@ -649,7 +370,8 @@ void NELogits1DSoftmaxKernel<IS_LOG>::run(const Window &window, const ThreadInfo
void *tmp_for_thread = _tmp->buffer() + (info.thread_id * tmp_size_for_thread);
- (*_func)(*_input, *_max, tmp_for_thread, *_output, _beta, window);
+ const auto *uk = get_implementation_logits(SoftmaxSelectorData{ _input->info()->data_type() });
+ uk->ukernel(_input, _max, tmp_for_thread, _output, _beta, IS_LOG, window);
}
template class NELogits1DSoftmaxKernel<true>;