From 35ceeb2199c569810a1524a0a21c2df2a3f5f29e Mon Sep 17 00:00:00 2001 From: Diego Lopez Recas Date: Mon, 4 Dec 2017 18:56:10 +0000 Subject: IVGCVSW-798 Add Softmax NEON support for QASYMM8 Change-Id: I4f2cca52caf210fdb7d6bb7e9436ac51cb5088b4 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112398 Reviewed-by: Anthony Barbier Tested-by: Jenkins --- src/core/AccessWindowAutoPadding.cpp | 4 +- src/core/AccessWindowStatic.cpp | 4 +- src/core/AccessWindowTranspose.cpp | 4 +- src/core/Error.cpp | 4 +- src/core/IAccessWindow.cpp | 27 +- src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 1369 ++++++++++++------------ src/core/SubTensorInfo.cpp | 2 +- src/core/TensorInfo.cpp | 4 +- src/runtime/NEON/functions/NESoftmaxLayer.cpp | 44 +- 9 files changed, 757 insertions(+), 705 deletions(-) (limited to 'src') diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp index b75ebcfeb8..74af99bbb9 100644 --- a/src/core/AccessWindowAutoPadding.cpp +++ b/src/core/AccessWindowAutoPadding.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -70,7 +70,7 @@ bool AccessWindowAutoPadding::update_window_if_needed(Window &window) const return false; } -bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window) const +bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window) { ARM_COMPUTE_UNUSED(window); diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp index 81ad60bc4c..2ddd59ed4d 100644 --- a/src/core/AccessWindowStatic.cpp +++ b/src/core/AccessWindowStatic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -163,7 +163,7 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const return window_modified; } -bool AccessWindowStatic::update_padding_if_needed(const Window &window) const +bool AccessWindowStatic::update_padding_if_needed(const Window &window) { ARM_COMPUTE_UNUSED(window); diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp index 4506a0b44c..3c45ab3571 100644 --- a/src/core/AccessWindowTranspose.cpp +++ b/src/core/AccessWindowTranspose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -180,7 +180,7 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const return window_modified; } -bool AccessWindowTranspose::update_padding_if_needed(const Window &window) const +bool AccessWindowTranspose::update_padding_if_needed(const Window &window) { // Only update the padding if the tensor allows it if(_info == nullptr || !_info->is_resizable()) diff --git a/src/core/Error.cpp b/src/core/Error.cpp index b593c27a7f..2f6a94bb85 100644 --- a/src/core/Error.cpp +++ b/src/core/Error.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -56,7 +56,7 @@ void arm_compute::error(const char *function, const char *file, const int line, va_end(args); throw std::runtime_error(err.error_description()); } -void Status::internal_throw_on_error() +void Status::internal_throw_on_error() const { throw std::runtime_error(_error_description); } diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp index 693d851a5d..7dfe5db5c5 100644 --- a/src/core/IAccessWindow.cpp +++ b/src/core/IAccessWindow.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -122,9 +122,10 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const if(min_y < front_pad_y_available) { // Not enough padding available, need to shrink the window - const int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y; + int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y; + start = std::min(start / _scale_y, window.y().end()); - window.set(1, Window::Dimension(start / _scale_y, window.y().end(), window.y().step())); + window.set(1, Window::Dimension(start, window.y().end(), window.y().step())); window_modified = true; } @@ -143,8 +144,10 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const if(static_cast(shape[1]) + tail_pad_y_available < max_y) { // Not enough padding available, need to shrink the window - const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height; - window.set(1, Window::Dimension(window.y().start(), end / _scale_y, window.y().step())); + int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height; + end = std::max(window.y().start(), end / _scale_y); + + window.set(1, Window::Dimension(window.y().start(), end, window.y().step())); window_modified = true; } } @@ -164,8 +167,10 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const if(min_x < front_pad_x_available) { // Not enough padding available, need to shrink the window - const int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x; - window.set(0, Window::Dimension(start / _scale_x, window.x().end(), window.x().step())); + int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x; + start = std::min(start / _scale_x, window.x().end()); + + window.set(0, Window::Dimension(start, window.x().end(), window.x().step())); window_modified = true; } @@ -181,8 +186,10 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const if(static_cast(shape[0]) + tail_pad_x_available < max_x) { // Not enough padding available, need to shrink the window - const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width; - window.set(0, Window::Dimension(window.x().start(), end / _scale_x, window.x().step())); + int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width; + end = std::max(window.x().start(), end / _scale_x); + + window.set(0, Window::Dimension(window.x().start(), end, window.x().step())); window_modified = true; } } @@ -192,7 +199,7 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const return window_modified; } -bool AccessWindowRectangle::update_padding_if_needed(const Window &window) const +bool AccessWindowRectangle::update_padding_if_needed(const Window &window) { // Only update the padding if the tensor allows it if(_info == nullptr || !_info->is_resizable()) diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index b13fb0e87c..13d87a0989 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -33,285 +33,433 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/utility.h" #include #include #include +#include -using namespace arm_compute; - -namespace -{ -Status validate_arguments_logits_1d_max(const ITensorInfo *input, const ITensorInfo *output) +namespace arm_compute { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); +template +struct vec_n_type; - // Checks performed when output is configured - if(output->total_size() != 0) - { - // Softmax across the x dimension - TensorShape output_shape{ input->tensor_shape() }; - output_shape.set(0, 1); +#define DECLARE_NEON_VEC_TYPE(T, N, V) \ + template <> \ + struct vec_n_type \ + { \ + using type = V; \ + }; - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - } +DECLARE_NEON_VEC_TYPE(uint8_t, 16, uint8x16_t) +DECLARE_NEON_VEC_TYPE(uint8_t, 8, uint8x8_t) - return Status{}; -} +DECLARE_NEON_VEC_TYPE(int8_t, 16, int8x16_t) +DECLARE_NEON_VEC_TYPE(int8_t, 8, int8x8_t) -std::pair validate_and_configure_window_logits_1d_max(ITensorInfo *input, ITensorInfo *output) -{ - // Configure kernel window - constexpr unsigned int num_elems_written_per_row = 1; - const int input_width = input->valid_region().shape.x(); +DECLARE_NEON_VEC_TYPE(uint16_t, 8, uint16x8_t) +DECLARE_NEON_VEC_TYPE(uint16_t, 4, uint16x4_t) - unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - bool window_changed = false; +DECLARE_NEON_VEC_TYPE(int16_t, 8, int16x8_t) +DECLARE_NEON_VEC_TYPE(int16_t, 4, int16x4_t) - if(output->total_size() != 0) - { - AccessWindowHorizontal output_access(output, 0, num_elems_written_per_row, 1.f / input_width); - window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - else - { - window_changed = update_window_and_padding(win, input_access); - } +DECLARE_NEON_VEC_TYPE(int32_t, 4, int32x4_t) +DECLARE_NEON_VEC_TYPE(int32_t, 2, int32x2_t) - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} +DECLARE_NEON_VEC_TYPE(uint32_t, 4, uint32x4_t) +DECLARE_NEON_VEC_TYPE(uint32_t, 2, uint32x2_t) -Status validate_arguments_logits_1d_shift_exp_sum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, max, sum, output); - ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +DECLARE_NEON_VEC_TYPE(float16_t, 8, float16x8_t) +DECLARE_NEON_VEC_TYPE(float16_t, 4, float16x4_t) +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - // Checks performed when output is configured - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); - } +DECLARE_NEON_VEC_TYPE(float, 4, float32x4_t) +DECLARE_NEON_VEC_TYPE(float, 2, float32x2_t) - // Checks performed when sum is configured - if(sum->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, sum); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max, sum); - } +template +using vec_n_t = typename vec_n_type::type; - return Status{}; -} +template +using vec_n_byte_t = vec_n_t < T, N / sizeof(T) >; + +template +using vec_16_byte_t = vec_n_byte_t; + +template +using vec_8_byte_t = vec_n_byte_t; + +template +using const_ptr_t = const T *; -std::pair validate_and_configure_window_logits_1d_shift_exp_sum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum) +template +using ptr_t = T *; + +#define FORWARD_DECLARE_VGET_LANE_FOR_TYPE(TYPE) \ + template \ + TYPE vget_lane(vec_8_byte_t vec); \ + template \ + TYPE vget_lane(vec_16_byte_t vec); + +FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint8_t) +FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int8_t) +FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint16_t) +FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int16_t) +FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint32_t) +FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int32_t) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +FORWARD_DECLARE_VGET_LANE_FOR_TYPE(float16_t) +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +FORWARD_DECLARE_VGET_LANE_FOR_TYPE(float) +template +float vget_lane(float32x4x4_t vec); + +template +using elem_type_t = decltype(vget_lane<0>(std::declval())); + +template +constexpr size_t vec_size_of(const V &vec) { - unsigned int num_elems_processed_per_iteration = input->valid_region().shape.x(); + return sizeof(vec) / sizeof(elem_type_t); +} - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal max_access(max, 0, 1); - AccessWindowHorizontal sum_access(sum, 0, 1); - bool window_changed = false; +template +V vdup_n(elem_type_t val); +template +V vld(const_ptr_t> ptr); + +#define DECLARE_NEON_FUNCTIONS_FOR_TYPE(TYPE, TAG) \ + template <> \ + inline vec_8_byte_t vdup_n>(TYPE val) \ + { \ + return vdup_n_##TAG(val); \ + } \ + template <> \ + inline vec_16_byte_t vdup_n>(TYPE val) \ + { \ + return vdupq_n_##TAG(val); \ + } \ + template <> \ + inline vec_8_byte_t vld>(const_ptr_t ptr) \ + { \ + return vld1_##TAG(ptr); \ + } \ + template <> \ + inline vec_16_byte_t vld>(const_ptr_t ptr) \ + { \ + return vld1q_##TAG(ptr); \ + } \ + inline void vst(ptr_t ptr, vec_8_byte_t vec) \ + { \ + vst1_##TAG(ptr, vec); \ + } \ + inline void vst(ptr_t ptr, vec_16_byte_t vec) \ + { \ + vst1q_##TAG(ptr, vec); \ + } \ + inline vec_16_byte_t vmax(vec_16_byte_t a, vec_16_byte_t b) \ + { \ + return vmaxq_##TAG(a, b); \ + } \ + inline vec_8_byte_t vpmax(vec_8_byte_t a, vec_8_byte_t b) \ + { \ + return vpmax_##TAG(a, b); \ + } \ + inline vec_8_byte_t vget_low(vec_16_byte_t vec) \ + { \ + return vget_low_##TAG(vec); \ + } \ + inline vec_8_byte_t vget_high(vec_16_byte_t vec) \ + { \ + return vget_high_##TAG(vec); \ + } \ + template \ + inline TYPE vget_lane(vec_8_byte_t vec) \ + { \ + static_assert(lane >= 0, "lane is out of bounds"); \ + static_assert(lane < vec_size_of(vec), "lane is out of bounds"); \ + return vget_lane_##TAG(vec, lane); \ + } \ + template \ + inline TYPE vget_lane(vec_16_byte_t vec) \ + { \ + static_assert(lane >= 0, "lane is out of bounds"); \ + static_assert(lane < vec_size_of(vec), "lane is out of bounds"); \ + return vgetq_lane_##TAG(vec, lane); \ + } - if(output->total_size() != 0) - { - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access); - output_access.set_valid_region(win, input->valid_region()); +template +T sqadd(T a, T b); +template +T sqsub(T a, T b); +template +T sqmul(T a, T b, int fixed_point_position); + +#define DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(TYPET, TYPEU, TAGT, TAGU) \ + inline vec_8_byte_t vqsub(vec_8_byte_t a, vec_8_byte_t b) \ + { \ + return vqsub_##TAGT(a, b); \ + } \ + inline vec_8_byte_t vqadd(vec_8_byte_t a, vec_8_byte_t b) \ + { \ + return vqadd_##TAGU(a, b); \ + } \ + inline vec_16_byte_t vqadd(vec_16_byte_t a, vec_16_byte_t b) \ + { \ + return vqaddq_##TAGU(a, b); \ + } \ + inline vec_8_byte_t vqexp(vec_8_byte_t vec, int fixed_point_position) \ + { \ + return vqexp_q##TAGT(vec, fixed_point_position); \ + } \ + inline auto vmovl(vec_8_byte_t vec)->decltype(vmovl_##TAGT(vec)) \ + { \ + return vmovl_##TAGT(vec); \ + } \ + inline vec_16_byte_t vqrecip(vec_16_byte_t vec, int fixed_point_position) \ + { \ + return vqrecipq_q##TAGT(vec, fixed_point_position); \ + } \ + inline vec_16_byte_t vqmul(vec_16_byte_t a, vec_16_byte_t b, int fixed_point_position) \ + { \ + return vqmulq_q##TAGT(a, b, fixed_point_position); \ + } \ + template <> \ + inline TYPEU sqadd(TYPEU a, TYPEU b) \ + { \ + return sqadd_q##TAGU(a, b); \ + } \ + inline TYPET sqexp(TYPET val, int fixed_point_position) \ + { \ + return sqexp_q##TAGT(val, fixed_point_position); \ + } \ + template <> \ + inline TYPET sqsub(TYPET a, TYPET b) \ + { \ + return sqsub_q##TAGT(a, b); \ + } \ + template <> \ + inline TYPET sqmul(TYPET a, TYPET b, int fixed_point_position) \ + { \ + return sqmul_q##TAGT(a, b, fixed_point_position); \ } - else - { - window_changed = update_window_and_padding(win, input_access, max_access, sum_access); + +#define DECLARE_NEON_FUNCTIONS_FOR_FLOAT(TYPE, TAG) \ + inline vec_8_byte_t vadd(vec_8_byte_t a, vec_8_byte_t b) \ + { \ + return vadd_##TAG(a, b); \ + } \ + inline vec_16_byte_t vadd(vec_16_byte_t a, vec_16_byte_t b) \ + { \ + return vaddq_##TAG(a, b); \ + } \ + inline vec_16_byte_t vsub(vec_16_byte_t a, vec_16_byte_t b) \ + { \ + return vsubq_##TAG(a, b); \ + } \ + inline vec_16_byte_t vexp(vec_16_byte_t vec) \ + { \ + return vexpq_##TAG(vec); \ + } \ + inline vec_16_byte_t vmul_n(vec_16_byte_t vec, TYPE val) \ + { \ + return vmulq_n_##TAG(vec, val); \ } - sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape())); +DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint8_t, u8) +DECLARE_NEON_FUNCTIONS_FOR_TYPE(int8_t, s8) +DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint16_t, u16) +DECLARE_NEON_FUNCTIONS_FOR_TYPE(int16_t, s16) +DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint32_t, u32) +DECLARE_NEON_FUNCTIONS_FOR_TYPE(int32_t, s32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +DECLARE_NEON_FUNCTIONS_FOR_TYPE(float16_t, f16) +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +DECLARE_NEON_FUNCTIONS_FOR_TYPE(float, f32) - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} +DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int8_t, int16_t, s8, s16) +DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int16_t, int32_t, s16, s32) -Status validate_arguments_logits_1d_norm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float16_t, f16) +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float, f32) - // Checks performed when output is configured - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); - } +template +VO vcvt(VI vec); - return Status{}; +template <> +float32x4x4_t vcvt(uint8x16_t vec) +{ + const auto low = vmovl_u8(vget_low(vec)); + const auto high = vmovl_u8(vget_high(vec)); + float32x4x4_t res = { { + vcvtq_f32_u32(vmovl_u16(vget_low(low))), + vcvtq_f32_u32(vmovl_u16(vget_high(low))), + vcvtq_f32_u32(vmovl_u16(vget_low(high))), + vcvtq_f32_u32(vmovl_u16(vget_high(high))) + } + }; + return res; } -std::pair validate_and_configure_window_logits_1d_norm(ITensorInfo *input, ITensorInfo *sum, ITensorInfo *output) +template <> +uint8x16_t vcvt(float32x4x4_t vec) { - // Configure kernel window - unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + uint16x8x2_t resU16 = { { + vcombine_u16(vqmovn_u32(vcvtq_u32_f32(vec.val[0])), + vqmovn_u32(vcvtq_u32_f32(vec.val[1]))), + vcombine_u16(vqmovn_u32(vcvtq_u32_f32(vec.val[2])), + vqmovn_u32(vcvtq_u32_f32(vec.val[3]))) + } + }; - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowStatic sum_access(sum, 0, 0, 1, sum->dimension(1)); - bool window_changed = false; + uint8x16_t res = vcombine_u8(vqmovn_u16(resU16.val[0]), vqmovn_u16(resU16.val[1])); + return res; +} - if(output->total_size() != 0) - { - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); +float32x4x4_t vexp(float32x4x4_t vec) +{ + float32x4x4_t res = { { + vexpq_f32(vec.val[0]), + vexpq_f32(vec.val[1]), + vexpq_f32(vec.val[2]), + vexpq_f32(vec.val[3]) + } + }; + return res; +} - window_changed = update_window_and_padding(win, input_access, sum_access, output_access); +template <> +float32x4x4_t vdup_n(float val) +{ + float32x4x4_t res = { { + vdupq_n_f32(val), + vdupq_n_f32(val), + vdupq_n_f32(val), + vdupq_n_f32(val) + } + }; + return res; +} - output_access.set_valid_region(win, input->valid_region()); - } - else - { - window_changed = update_window_and_padding(win, input_access, sum_access); - } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); +float32x4x4_t vmul_n(float32x4x4_t vec, float val) +{ + float32x4x4_t res = { { + vmulq_n_f32(vec.val[0], val), + vmulq_n_f32(vec.val[1], val), + vmulq_n_f32(vec.val[2], val), + vmulq_n_f32(vec.val[3], val) + } + }; + return res; } -void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window) +float32x4x4_t vadd(float32x4x4_t a, float32x4x4_t b) { - Window in_slice = window.first_slice_window_1D(); + float32x4x4_t res = { { + vaddq_f32(a.val[0], b.val[0]), + vaddq_f32(a.val[1], b.val[1]), + vaddq_f32(a.val[2], b.val[2]), + vaddq_f32(a.val[3], b.val[3]) + } + }; + return res; +} - Window window_max(window); - window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); - Window max_slice = window_max.first_slice_window_1D(); +namespace +{ +Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) +{ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32); +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - do + // Validate in case of configured output + if(output.total_size() != 0) { - Iterator input(in, in_slice); - Iterator output(out, max_slice); - - qint8x16_t vec_max = vdupq_n_s8(std::numeric_limits::lowest()); - - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input.ptr()); - const qint8x16_t current_value = vld1q_qs8(in_ptr); - vec_max = vmaxq_qs8(vec_max, current_value); - }, - input); - - qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max)); - carry_max = vpmax_qs8(carry_max, carry_max); - carry_max = vpmax_qs8(carry_max, carry_max); - carry_max = vpmax_qs8(carry_max, carry_max); - - *(reinterpret_cast(output.ptr())) = vget_lane_s8(carry_max, 0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); + + return Status{}; } -void logits_1d_max_qs16(const ITensor *in, ITensor *out, const Window &window) + +std::pair validate_and_configure_window_logits_1d_max(ITensorInfo &input, ITensorInfo &output) { - Window in_slice = window.first_slice_window_1D(); + // Softmax across the x dimension + const TensorShape output_shape = TensorShape(input.tensor_shape()).set(0, 1); + // Output auto initialization if not yet initialized + auto_init_if_empty(output, output_shape, 1, input.data_type(), input.fixed_point_position(), input.quantization_info()); - Window window_max(window); - window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); - Window max_slice = window_max.first_slice_window_1D(); + // Configure kernel window + const int input_width = input.valid_region().shape.x(); + const int num_elems_processed_per_iteration = 16U / data_size_from_type(input.data_type()); + const int num_elems_read_per_iteration = ceil_to_multiple(input_width, num_elems_processed_per_iteration); - do - { - Iterator input(in, in_slice); - Iterator output(out, max_slice); + const ValidRegion out_valid_region(ValidRegion(input.valid_region()).set(0, 0, 1)); + output.set_valid_region(out_valid_region); - qint16x8_t vec_max = vdupq_n_qs16(std::numeric_limits::lowest()); + Window win = calculate_max_window(output); - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input.ptr()); - const qint16x8_t current_value = vld1q_qs16(in_ptr); - vec_max = vmaxq_qs16(vec_max, current_value); - }, - input); + AccessWindowHorizontal input_access(&input, input.valid_region().anchor.x(), num_elems_read_per_iteration); + AccessWindowHorizontal output_access(&output, 0, 1); - qint16x4_t carry_max = vpmax_qs16(vget_high_qs16(vec_max), vget_low_qs16(vec_max)); - carry_max = vpmax_qs16(carry_max, carry_max); - carry_max = vpmax_qs16(carry_max, carry_max); + const bool window_changed = update_window_and_padding(win, input_access, output_access); - *(reinterpret_cast(output.ptr())) = vget_lane_s16(carry_max, 0); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); + const Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void logits_1d_max_f16(const ITensor *in, ITensor *out, const Window &window) +template +auto reduce_max(V vec) -> elem_type_t { - Window in_slice = window.first_slice_window_1D(); + constexpr int N = vec_size_of(vec); - Window window_max(window); - window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); - Window max_slice = window_max.first_slice_window_1D(); + auto carry_max = vpmax(vget_high(vec), vget_low(vec)); - do + for(int k = N / 2; k > 1; k /= 2) { - Iterator input(in, in_slice); - Iterator output(out, max_slice); - - float16x8_t vec_max = vdupq_n_f16(std::numeric_limits::lowest()); - - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input.ptr()); - const float16x8_t current_value = vld1q_f16(in_ptr); - vec_max = vmaxq_f16(vec_max, current_value); - }, - input); - - float16x4_t carry_max = vpmax_f16(vget_high_f16(vec_max), vget_low_f16(vec_max)); - carry_max = vpmax_f16(carry_max, carry_max); - carry_max = vpmax_f16(carry_max, carry_max); - - *(reinterpret_cast(output.ptr())) = vget_lane_f16(carry_max, 0); + carry_max = vpmax(carry_max, carry_max); } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); + + return vget_lane<0>(carry_max); } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window) +template +void logits_1d_max(const ITensor &in, ITensor &out, const Window &window) { - Window in_slice = window.first_slice_window_1D(); + const auto start_x = in.info()->valid_region().anchor.x(); + const size_t input_width = in.info()->valid_region().shape.x(); - Window window_max(window); - window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); - Window max_slice = window_max.first_slice_window_1D(); + Iterator input(&in, window); + Iterator output(&out, window); - do + execute_window_loop(window, [&](const Coordinates &) { - Iterator input(in, in_slice); - Iterator output(out, max_slice); + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(output.ptr()); - float32x4_t vec_max = vdupq_n_f32(-FLT_MAX); + // Init max value + auto vec_max = vdup_n>(std::numeric_limits::lowest()); - execute_window_loop(in_slice, [&](const Coordinates & id) + // Loop over input row + for(const T *it = in_ptr; it < (in_ptr + input_width); it += vec_size_of(vec_max)) { - const auto in_ptr = reinterpret_cast(input.ptr()); - const float32x4_t current_value = vld1q_f32(in_ptr); - vec_max = vmaxq_f32(vec_max, current_value); - }, - input); - - float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max)); - carry_max = vpmax_f32(carry_max, carry_max); + const auto current_value = vld>(it); + vec_max = vmax(vec_max, current_value); + } - *(reinterpret_cast(output.ptr())) = vget_lane_f32(carry_max, 0); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); + const T max_val = reduce_max(vec_max); + *out_ptr = max_val; + }, + input, output); } } // namespace @@ -328,54 +476,54 @@ BorderSize NELogits1DMaxKernel::border_size() const void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Softmax across the x dimension - TensorShape output_shape{ input->info()->tensor_shape() }; - output_shape.set(0, 1); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); - + ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info()); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(input->info(), output->info())); - - const int input_width = input->info()->valid_region().shape.x(); - unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type()); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*input->info(), *output->info())); + // Configure kernel window + auto win_config = validate_and_configure_window_logits_1d_max(*input->info(), *output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); switch(input->info()->data_type()) { + case DataType::QASYMM8: + _func = &logits_1d_max; + break; case DataType::QS8: - _func = &logits_1d_max_qs8; + _func = &logits_1d_max; break; case DataType::QS16: - _func = &logits_1d_max_qs16; + _func = &logits_1d_max; break; - case DataType::F32: - _func = &logits_1d_max_f32; - break; - case DataType::F16: #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - _func = &logits_1d_max_f16; + case DataType::F16: + _func = &logits_1d_max; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + _func = &logits_1d_max; + break; default: ARM_COMPUTE_ERROR("Unsupported data type."); } - _input = input; - _output = output; - _border_size = BorderSize(0, num_elems_processed_per_iteration - (input_width % num_elems_processed_per_iteration), 0, 0); + _input = input; + _output = output; + + const int input_width = input->info()->valid_region().shape.x(); + const int num_elems_processed_per_iteration = 16U / data_size_from_type(input->info()->data_type()); + const int num_elems_read_per_iteration = ceil_to_multiple(input_width, num_elems_processed_per_iteration); + + _border_size = BorderSize(0, num_elems_read_per_iteration - input_width, 0, 0); - // Configure kernel window - auto win_config = validate_and_configure_window_logits_1d_max(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } Status NELogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_max(input->clone().get(), output->clone().get()).first); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*input, *output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_max(*input->clone(), *output->clone()).first); return Status{}; } @@ -387,297 +535,393 @@ void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input, _output, window); + (*_func)(*_input, *_output, window); } namespace { -void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta) +Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensorInfo &max, + const ITensorInfo &output, const float beta, const ITensorInfo &tmp) { - ARM_COMPUTE_UNUSED(beta); + // Check input +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32); +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - Window window_max(window); - window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type()); - Window max_slice = window_max.first_slice_window_1D(); - Window in_slice = window.first_slice_window_1D(); + // Check max + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &max); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(input.tensor_shape()).set(0, 1), max.tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &max); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &max); - constexpr int step = 8; - const int long_steps = in->info()->valid_region().shape.x() / step; - const int small_steps = in->info()->valid_region().shape.x() % step; - const int fixed_point_position = in->info()->fixed_point_position(); + // Check output if configured + if(output.total_size() != 0) + { + const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info(); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output); + ARM_COMPUTE_RETURN_ERROR_ON(output.quantization_info() != output_quantization); + } + + // Check beta + ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input.data_type())); - do + // Check tmp if configured + if(tmp.total_size() != 0) { - Iterator input(in, in_slice); - Iterator exp(out, in_slice); - Iterator _max(max, max_slice); - Iterator _sum(sum, max_slice); + const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type(); + ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &tmp); + // We could potentially reduce tmp memory if we could predict or make an assumption + // on the maximum number of threads that will run in parallel. + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &tmp); + } - // Get pointers - auto in_ptr = reinterpret_cast(input.ptr()); - auto exp_ptr = reinterpret_cast(exp.ptr()); + return Status{}; +} - // Init sum to zero - qint16x8_t vec_sum_value = vdupq_n_qs16(0); +std::pair validate_and_configure_window_logits_softmax(ITensorInfo &input, ITensorInfo &max, + ITensorInfo &output, ITensorInfo &tmp) +{ + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type()); - // Get max value - const auto max_ptr = reinterpret_cast(_max.ptr()); - const qint8x8_t vec_max = vdup_n_qs8(*max_ptr); + // Output auto initialization if not yet initialized + const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info(); + auto_init_if_empty(output, TensorInfo(input).set_quantization_info(output_quantization).reset_padding()); - // Run neon loop - for(int i = 0; i < long_steps; ++i) - { - qint8x8_t vec_elements = vld1_qs8(in_ptr); - vec_elements = vqsub_qs8(vec_elements, vec_max); - vec_elements = vqexp_qs8(vec_elements, fixed_point_position); + // Tmp auto initialization if not yet initialized + const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type(); + auto_init_if_empty(tmp, TensorInfo(input).set_data_type(tmp_data_type).reset_padding()); - vst1_qs8(exp_ptr, vec_elements); - vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements)); + const int input_width = input.valid_region().shape.x(); - in_ptr += step; - exp_ptr += step; - } - // Reduce sum - const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value)); - const qint16_t sum0 = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1)); - const qint16_t sum1 = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3)); - qint16_t sum = sqadd_qs16(sum0, sum1); - - // Run remaining elements - for(int i = 0; i < small_steps; ++i) - { - qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position); - exp_ptr[i] = element; - sum = sqadd_qs16(sum, element); - } + Window win = calculate_max_window(max); - *(reinterpret_cast(_sum.ptr())) = sqmovn_qs16(sum); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); -} -void logits_1d_shift_exp_sum_qs16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta) -{ - ARM_COMPUTE_UNUSED(beta); + AccessWindowHorizontal input_access(&input, input.valid_region().anchor.x(), input_width); + AccessWindowHorizontal max_access(&input, 0, 1); + AccessWindowHorizontal output_access(&output, input.valid_region().anchor.x(), input_width); + AccessWindowHorizontal tmp_access(&tmp, input.valid_region().anchor.x(), input_width); - Window window_max(window); - window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + const bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, tmp_access); - Window max_slice = window_max.first_slice_window_1D(); - Window in_slice = window.first_slice_window_1D(); + output.set_valid_region(input.valid_region()); - constexpr int step = 4; - const int long_steps = in->info()->valid_region().shape.x() / step; - const int small_steps = in->info()->valid_region().shape.x() % step; - const int fixed_point_position = in->info()->fixed_point_position(); + const Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} - do +template +struct reduce_add_impl +{ + template + static T reduce(F add_fn, vec_n_t vec) + { + constexpr int H = (S + E + 1) / 2; + const auto reduced_high = reduce_add_impl < T, N, S, H - 1 >::reduce(add_fn, vec); + const auto reduced_low = reduce_add_impl::reduce(add_fn, vec); + return add_fn(reduced_high, reduced_low); + } +}; +template +struct reduce_add_impl +{ + template + static T reduce(F /*add_fn*/, vec_n_t vec) { - Iterator input(in, in_slice); - Iterator exp(out, in_slice); - Iterator _max(max, max_slice); - Iterator _sum(sum, max_slice); + return vget_lane(vec); + } +}; +template +elem_type_t reduce_add(F add_fn, V vec) +{ + constexpr int N = vec_size_of(vec); + return reduce_add_impl < elem_type_t, N, 0, N - 1 >::reduce(add_fn, vec); +} - // Get pointers - auto in_ptr = reinterpret_cast(input.ptr()); - auto exp_ptr = reinterpret_cast(exp.ptr()); +void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta, const Window &window) +{ + const int start_x = in.info()->valid_region().anchor.x(); + const int input_width = in.info()->valid_region().shape.x(); - // Init sum to zero - qint32x4_t vec_sum_value = vdupq_n_qs32(0); + const float scale_beta = -beta * in.info()->quantization_info().scale; - // Get max value - const auto max_ptr = reinterpret_cast(_max.ptr()); - const qint16x4_t vec_max = vdup_n_qs16(*max_ptr); + Iterator in_it(&in, window); + Iterator max_it(&max, window); + Iterator out_it(&out, window); - // Run neon loop - for(int i = 0; i < long_steps; ++i) - { - qint16x4_t vec_elements = vld1_qs16(in_ptr); - vec_elements = vqsub_qs16(vec_elements, vec_max); - vec_elements = vqexp_qs16(vec_elements, fixed_point_position); + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - vst1_qs16(exp_ptr, vec_elements); - vec_sum_value = vqaddq_qs32(vec_sum_value, vmovl_s16(vec_elements)); + float sum_inversed; - in_ptr += step; - exp_ptr += step; + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = vdup_n>(max_val); + + /* Init sum to zero */ + auto vec_sum = vdup_n(0.f); + + /* Loop over row and compute exponentials and sum */ + int i = 0; + constexpr int vec_size = vec_size_of(vec_max); + for(; i <= (input_width - vec_size); i += vec_size) + { + auto vec_elements = vld>(in_ptr + i); + vec_elements = vsubq_u8(vec_max, vec_elements); + + auto vec_elements_flt = vcvt(vec_elements); + vec_elements_flt = vexp(vmul_n(vec_elements_flt, scale_beta)); + + vec_sum = vadd(vec_sum, vec_elements_flt); + + vst4q_f32(tmp_ptr + i, vec_elements_flt); + } + /* Reduce sum */ + const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), + vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + const auto sum_8_byte = vadd_f32(vget_low(sum_16_byte), vget_high(sum_16_byte)); + float sum = reduce_add(std::plus(), sum_8_byte); + + /* Run remaining elements */ + for(; i < input_width; ++i) + { + const float element = std::exp((max_val - in_ptr[i]) * scale_beta); + sum += element; + tmp_ptr[i] = element; + } + + sum_inversed = 256.f / sum; } - // Reduce sum - qint32x2_t carry_addition = vqadd_qs32(vget_high_s32(vec_sum_value), vget_low_s32(vec_sum_value)); - qint32_t sum = vget_lane_s32(carry_addition, 0) + vget_lane_s32(carry_addition, 1); - // Run remaining elements - for(int i = 0; i < small_steps; ++i) + /* Normalize exponentials */ { - qint16_t element = sqexp_qs16(sqsub_qs16(in_ptr[i], *max_ptr), fixed_point_position); - exp_ptr[i] = element; - sum = sqadd_qs32(sum, element); + /* Loop over row and compute softmax */ + int i = 0; + { + constexpr int vec_size = 16; + for(; i <= (input_width - vec_size); i += vec_size) + { + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + i); + auto normalized_value = vcvt>(vmul_n(vec_in, sum_inversed)); + vst(out_ptr + i, normalized_value); + } + } + /* Run remaining elements */ + for(; i < input_width; ++i) + { + out_ptr[i] = utility::saturate_cast(tmp_ptr[i] * sum_inversed); + } } - - *(reinterpret_cast(_sum.ptr())) = sqmovn_qs32(sum); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); + }, + in_it, max_it, out_it); } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void logits_1d_shift_exp_sum_f16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta) +template +void logits_1d_softmax_fixed_point(const ITensor &in, const ITensor &max, void *const tmp, + ITensor &out, const float /*beta*/, const Window &window) { - Window window_max(window); - window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + const int start_x = in.info()->valid_region().anchor.x(); + const int input_width = in.info()->valid_region().shape.x(); - Window max_slice = window_max.first_slice_window_1D(); - Window in_slice = window.first_slice_window_1D(); + const int fixed_point_position = in.info()->fixed_point_position(); - constexpr int step = 8; - const int long_steps = in->info()->valid_region().shape.x() / step; - const int small_steps = in->info()->valid_region().shape.x() % step; + Iterator in_it(&in, window); + Iterator max_it(&max, window); + Iterator out_it(&out, window); - do + execute_window_loop(window, [&](const Coordinates &) { - Iterator input(in, in_slice); - Iterator exp(out, in_slice); - Iterator _max(max, max_slice); - Iterator _sum(sum, max_slice); - - // Get pointers - auto in_ptr = reinterpret_cast(input.ptr()); - auto exp_ptr = reinterpret_cast(exp.ptr()); + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - // Init sum to zero - float16x8_t vec_sum_value = vdupq_n_f16(0); + vec_16_byte_t vec_sum_inversed; - // Get max value - const auto max_ptr = reinterpret_cast(_max.ptr()); - const float16x8_t vec_max = vdupq_n_f16(*max_ptr); - - // Run neon loop - for(int i = 0; i < long_steps; ++i) + /* Compute exponentials and sum */ { - float16x8_t vec_elements = vld1q_f16(in_ptr); - vec_elements = vsubq_f16(vec_elements, vec_max); - vec_elements = vmulq_n_f16(vec_elements, beta); - vec_elements = vexpq_f16(vec_elements); - - vst1q_f16(exp_ptr, vec_elements); - vec_sum_value = vaddq_f16(vec_sum_value, vec_elements); - - in_ptr += step; - exp_ptr += step; + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = vdup_n>(max_val); + + /* Init sum to zero */ + auto vec_sum = vdup_n>(0); + + /* Loop over row and compute exponentials and sum */ + int i = 0; + constexpr int vec_size = vec_size_of(vec_sum); + for(; i <= (input_width - vec_size); i += vec_size) + { + auto vec_elements = vld>(in_ptr + i); + vec_elements = vqsub(vec_elements, vec_max); + vec_elements = vqexp(vec_elements, fixed_point_position); + vec_sum = vqadd(vec_sum, vmovl(vec_elements)); + vst(tmp_ptr + i, vec_elements); + } + /* Reduce sum */ + const vec_8_byte_t sum_8_byte = vqadd(vget_high(vec_sum), vget_low(vec_sum)); + U sum = reduce_add(sqadd, sum_8_byte); + + /* Run remaining elements */ + for(; i < input_width; ++i) + { + T element = sqexp(sqsub(in_ptr[i], max_val), fixed_point_position); + sum = sqadd(sum, element); + tmp_ptr[i] = element; + } + + const auto qsum = utility::saturate_cast(sum); + vec_sum_inversed = vqrecip(vdup_n>(qsum), fixed_point_position); } - // Reduce sum - const float16x4_t sum_red = vadd_f16(vget_low_f16(vec_sum_value), vget_high_f16(vec_sum_value)); - const float16x4_t carry_addition = vpadd_f16(sum_red, sum_red); - float16_t sum = vget_lane_f16(carry_addition, 0) + vget_lane_f16(carry_addition, 1); - // Run remaining elements - for(int i = 0; i < small_steps; ++i) + /* Normalize exponentials */ { - const float16_t element = std::exp(static_cast(in_ptr[i] - *max_ptr) * beta); - exp_ptr[i] = element; - sum += element; + /* Loop over row and compute softmax */ + int i = 0; + constexpr int vec_size = vec_size_of(vec_sum_inversed); + for(; i <= (input_width - vec_size); i += vec_size) + { + const auto vec_in = vld>(tmp_ptr + i); + const vec_16_byte_t normalized_value = vqmul(vec_in, vec_sum_inversed, fixed_point_position); + vst(out_ptr + i, normalized_value); + } + + const T sum_inversed = vget_lane<0>(vec_sum_inversed); + + /* Run remaining elements */ + for(; i < input_width; ++i) + { + out_ptr[i] = sqmul(tmp_ptr[i], sum_inversed, fixed_point_position); + } } - *(reinterpret_cast(_sum.ptr())) = sum; - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); + }, + in_it, max_it, out_it); } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta) +template +void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp, + ITensor &out, const float beta, const Window &window) { - Window window_max(window); - window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + const int start_x = in.info()->valid_region().anchor.x(); + const int input_width = in.info()->valid_region().shape.x(); - Window max_slice = window_max.first_slice_window_1D(); - Window in_slice = window.first_slice_window_1D(); + Iterator in_it(&in, window); + Iterator max_it(&max, window); + Iterator out_it(&out, window); - constexpr int step = 4; - const int long_steps = in->info()->valid_region().shape.x() / step; - const int small_steps = in->info()->valid_region().shape.x() % step; - - do + execute_window_loop(window, [&](const Coordinates &) { - Iterator input(in, in_slice); - Iterator exp(out, in_slice); - Iterator _max(max, max_slice); - Iterator _sum(sum, max_slice); - - // Get pointers - auto in_ptr = reinterpret_cast(input.ptr()); - auto exp_ptr = reinterpret_cast(exp.ptr()); - - // Init sum to zero - float32x4_t vec_sum_value = vdupq_n_f32(0.0f); + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); - // Get max value - const auto max_ptr = reinterpret_cast(_max.ptr()); - const float32x4_t vec_max = vdupq_n_f32(*max_ptr); + T sum_inversed; - // Run neon loop - for(int i = 0; i < long_steps; ++i) + /* Compute exponentials and sum */ { - float32x4_t vec_elements = vld1q_f32(in_ptr); - vec_elements = vsubq_f32(vec_elements, vec_max); - vec_elements = vmulq_n_f32(vec_elements, beta); - vec_elements = vexpq_f32(vec_elements); - - vst1q_f32(exp_ptr, vec_elements); - vec_sum_value = vaddq_f32(vec_elements, vec_sum_value); - - in_ptr += step; - exp_ptr += step; + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = vdup_n>(max_val); + + /* Init sum to zero */ + auto vec_sum = vdup_n>(0); + + /* Loop over row and compute exponentials and sum */ + int i = 0; + constexpr int vec_size = vec_size_of(vec_sum); + for(; i <= (input_width - vec_size); i += vec_size) + { + auto vec_elements = vld>(in_ptr + i); + vec_elements = vsub(vec_elements, vec_max); + vec_elements = vexp(vmul_n(vec_elements, beta)); + vec_sum = vadd(vec_sum, vec_elements); + vst(tmp_ptr + i, vec_elements); + } + /* Reduce sum */ + const auto sum_8_byte = vadd(vget_high(vec_sum), vget_low(vec_sum)); + T sum = reduce_add([](T a, T b) -> T { return a + b; }, sum_8_byte); + + /* Run remaining elements */ + for(; i < input_width; ++i) + { + T element = std::exp((in_ptr[i] - max_val) * beta); + sum += element; + tmp_ptr[i] = element; + } + + sum_inversed = T(1) / sum; } - // Reduce sum - float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value)); - carry_addition = vpadd_f32(carry_addition, carry_addition); - float sum = vget_lane_f32(carry_addition, 0); - - // Run remaining elements - for(int i = 0; i < small_steps; ++i) + /* Normalize exponentials */ { - float element = std::exp((in_ptr[i] - *max_ptr) * beta); - exp_ptr[i] = element; - sum += element; + /* Loop over row and compute softmax */ + int i = 0; + { + constexpr int vec_size = vec_size_of(vec_16_byte_t {}); + for(; i <= (input_width - vec_size); i += vec_size) + { + auto vec_in = vld>(tmp_ptr + i); + vec_16_byte_t normalized_value = vmul_n(vec_in, sum_inversed); + vst(out_ptr + i, normalized_value); + } + } + /* Run remaining elements */ + for(; i < input_width; ++i) + { + out_ptr[i] = tmp_ptr[i] * sum_inversed; + } } - - *(reinterpret_cast(_sum.ptr())) = sum; - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); + }, + in_it, max_it, out_it); } -} //namespace +} // namespace -NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel() - : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr), _beta(1.0f) +NELogits1DSoftmaxKernel::NELogits1DSoftmaxKernel() + : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _beta(1.0f), _tmp(nullptr) { } -void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum, float beta) +void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - + ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, output, tmp); + ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), max->info(), output->info(), tmp->info()); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info(), beta)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*input->info(), *max->info(), *output->info(), beta, *tmp->info())); + // Configure kernel window + auto win_config = validate_and_configure_window_logits_softmax(*input->info(), *max->info(), *output->info(), *tmp->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); switch(input->info()->data_type()) { + case DataType::QASYMM8: + _func = &logits_1d_softmax_qasymm8; + break; case DataType::QS8: - _func = &logits_1d_shift_exp_sum_qs8; + _func = &logits_1d_softmax_fixed_point; break; case DataType::QS16: - _func = &logits_1d_shift_exp_sum_qs16; - break; - case DataType::F32: - _func = &logits_1d_shift_exp_sum_f32; + _func = &logits_1d_softmax_fixed_point; break; - case DataType::F16: #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - _func = &logits_1d_shift_exp_sum_f16; + case DataType::F16: + _func = &logits_1d_softmax_float; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + _func = &logits_1d_softmax_float; + break; default: ARM_COMPUTE_ERROR("Unsupported data type."); break; @@ -686,224 +930,37 @@ void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor _input = input; _max = max; _output = output; - _sum = sum; _beta = beta; + _tmp = tmp; - // Configure kernel window - auto win_config = validate_and_configure_window_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NELogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta) +Status NELogits1DSoftmaxKernel::validate(const ITensorInfo *input, const ITensorInfo *max, + const ITensorInfo *output, const float beta, const ITensorInfo *tmp) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_shift_exp_sum(input, max, output, sum, beta)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_shift_exp_sum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, output, tmp); + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*input, *max, *output, beta, *tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_softmax(*input->clone(), *max->clone(), *output->clone(), *tmp->clone()).first); return Status{}; } -void NELogits1DShiftExpSumKernel::run(const Window &window, const ThreadInfo &info) +void NELogits1DSoftmaxKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input, _max, _output, _sum, window, _beta); -} + const unsigned int num_elems_processed_per_iteration = _input->info()->valid_region().shape.x(); + const unsigned int tmp_size_for_thread = _tmp->info()->element_size() * num_elems_processed_per_iteration; -namespace -{ -void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) -{ - Window window_sum(window); - window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); - Window sum_slice = window_sum.first_slice_window_1D(); - Window in_slice = window.first_slice_window_1D(); + ARM_COMPUTE_ERROR_ON(_tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); - const int fixed_point_position = in->info()->fixed_point_position(); + void *tmp_for_thread = _tmp->buffer() + (info.thread_id * tmp_size_for_thread); - do - { - Iterator input(in, in_slice); - Iterator _sum(sum, sum_slice); - Iterator output(out, in_slice); - - const int8_t sum_value = *reinterpret_cast(_sum.ptr()); - const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position); - - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - const qint8x16_t vec_in = vld1q_qs8(in_ptr); - const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position); - - vst1q_qs8(out_ptr, normalized_value); - }, - input, output); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); + (*_func)(*_input, *_max, tmp_for_thread, *_output, _beta, window); } -void logits_1d_norm_qs16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) -{ - Window window_sum(window); - window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); - Window sum_slice = window_sum.first_slice_window_1D(); - Window in_slice = window.first_slice_window_1D(); - const int fixed_point_position = in->info()->fixed_point_position(); - - do - { - Iterator input(in, in_slice); - Iterator _sum(sum, sum_slice); - Iterator output(out, in_slice); - - const int16_t sum_value = *reinterpret_cast(_sum.ptr()); - const qint16x8_t vec_sum_inversed = vqrecipq_qs16(vdupq_n_qs16(sum_value), fixed_point_position); - - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - const qint16x8_t vec_in = vld1q_qs16(in_ptr); - const qint16x8_t normalized_value = vqmulq_qs16(vec_in, vec_sum_inversed, fixed_point_position); - - vst1q_qs16(out_ptr, normalized_value); - }, - input, output); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); -} -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void logits_1d_norm_f16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) -{ - Window window_sum(window); - window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); - Window sum_slice = window_sum.first_slice_window_1D(); - Window in_slice = window.first_slice_window_1D(); - - do - { - Iterator input(in, in_slice); - Iterator _sum(sum, sum_slice); - Iterator output(out, in_slice); - - const float16_t sum_value = *reinterpret_cast(_sum.ptr()); - const float16x8_t vec_sum_inversed = vdupq_n_f16(1.0f / sum_value); - - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - const float16x8_t vec_in = vld1q_f16(in_ptr); - const float16x8_t normalized_value = vmulq_f16(vec_in, vec_sum_inversed); - - vst1q_f16(out_ptr, normalized_value); - }, - input, output); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) -{ - Window window_sum(window); - window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); - Window sum_slice = window_sum.first_slice_window_1D(); - Window in_slice = window.first_slice_window_1D(); - - do - { - Iterator input(in, in_slice); - Iterator _sum(sum, sum_slice); - Iterator output(out, in_slice); - - const float sum_value = *reinterpret_cast(_sum.ptr()); - const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value); - - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - const float32x4_t vec_in = vld1q_f32(in_ptr); - const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed); - - vst1q_f32(out_ptr, normalized_value); - }, - input, output); - } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); -} -} // namespace - -NELogits1DNormKernel::NELogits1DNormKernel() - : _func(nullptr), _input(nullptr), _sum(nullptr), _output(nullptr) -{ -} - -void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_norm(input->info(), sum->info(), output->info())); - - _input = input; - _sum = sum; - _output = output; - - switch(input->info()->data_type()) - { - case DataType::QS8: - _func = &logits_1d_norm_qs8; - break; - case DataType::QS16: - _func = &logits_1d_norm_qs16; - break; - case DataType::F32: - _func = &logits_1d_norm_f32; - break; - case DataType::F16: -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - _func = &logits_1d_norm_f16; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - break; - } - - // Configure kernel window - auto win_config = validate_and_configure_window_logits_1d_norm(input->info(), sum->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); -} - -Status NELogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_norm(input, sum, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_norm(input->clone().get(), sum->clone().get(), output->clone().get()).first); - - return Status{}; -} - -void NELogits1DNormKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(_input, _sum, _output, window); -} +} // namespace arm_compute diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp index 0150a95cc6..836c3794c2 100644 --- a/src/core/SubTensorInfo.cpp +++ b/src/core/SubTensorInfo.cpp @@ -92,7 +92,7 @@ std::unique_ptr SubTensorInfo::clone() const return clone_obj; } -ITensorInfo &SubTensorInfo::set_tensor_shape(TensorShape shape) +ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape) { ARM_COMPUTE_ERROR_ON(_parent == nullptr); diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp index 24988e2217..bd0c85f3d4 100644 --- a/src/core/TensorInfo.cpp +++ b/src/core/TensorInfo.cpp @@ -348,7 +348,7 @@ ITensorInfo &TensorInfo::set_format(Format format) return *this; } -ITensorInfo &TensorInfo::set_tensor_shape(TensorShape shape) +ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape) { _tensor_shape = shape; _offset_first_element_in_bytes = 0; @@ -378,7 +378,7 @@ ITensorInfo &TensorInfo::set_fixed_point_position(int fixed_point_position) return *this; } -ITensorInfo &TensorInfo::set_quantization_info(QuantizationInfo quantization_info) +ITensorInfo &TensorInfo::set_quantization_info(const QuantizationInfo &quantization_info) { _quantization_info = quantization_info; return *this; diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 8e6773c5b1..4fb83007c5 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -32,7 +32,7 @@ using namespace arm_compute; NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp() + : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp() { } @@ -40,31 +40,22 @@ void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - // Create intermediate tensors shapes - TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()); - _tmp.allocator()->init(tensor_info_tmp); + // Configure Kernels + _max_kernel.configure(input, &_max); + _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE); + _softmax_kernel.configure(input, &_max, output, beta, &_tmp); - TensorShape shape = input->info()->tensor_shape(); - shape.set(0, 1); - TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()); - _max.allocator()->init(tensor_info_max_sum); - _sum.allocator()->init(tensor_info_max_sum); + // Init intermediate tensors + _max.allocator()->init(*_max.info()); + _tmp.allocator()->init(*_tmp.info()); // Manage intermediate buffers - _memory_group.manage(&_tmp); _memory_group.manage(&_max); - _memory_group.manage(&_sum); - - // Configure Kernels - _max_kernel.configure(input, &_max); - _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta); - _norm_kernel.configure(&_tmp, &_sum, output); - _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE); + _memory_group.manage(&_tmp); // Allocate intermediate tensors - _tmp.allocator()->allocate(); _max.allocator()->allocate(); - _sum.allocator()->allocate(); + _tmp.allocator()->allocate(); } Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta) @@ -72,14 +63,12 @@ Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *out // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - TensorShape max_sum_shape = input->tensor_shape(); - max_sum_shape.set(0, 1); - - TensorInfo tensor_info_max_sum(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(max_sum_shape)); + const TensorShape max_shape = TensorShape(input->tensor_shape()).set(0, 1); + const TensorInfo tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding(); + const TensorInfo dont_care; ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DShiftExpSumKernel::validate(input, &tensor_info_max_sum, input, &tensor_info_max_sum, beta)); - ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DNormKernel::validate(input, &tensor_info_max_sum, output)); + ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(input, &tensor_info_max_sum, output, beta, &dont_care)); return Status{}; } @@ -90,8 +79,7 @@ void NESoftmaxLayer::run() NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); NEScheduler::get().schedule(&_max_kernel, Window::DimY); - NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY); - NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + NEScheduler::get().schedule(&_softmax_kernel, Window::DimY); _memory_group.release(); } -- cgit v1.2.1