diff options
Diffstat (limited to 'src/cpu/kernels/add/generic/neon/impl.h')
-rw-r--r-- | src/cpu/kernels/add/generic/neon/impl.h | 145 |
1 files changed, 87 insertions, 58 deletions
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h index fb786c5bc1..faa99baffe 100644 --- a/src/cpu/kernels/add/generic/neon/impl.h +++ b/src/cpu/kernels/add/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -35,7 +36,8 @@ namespace arm_compute namespace cpu { template <typename ScalarType> -void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>; @@ -53,7 +55,7 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -69,31 +71,36 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; - } - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto res = (policy == ConvertPolicy::SATURATE) + ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) + : wrapper::vadd(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) + ? wrapper::add_sat(broadcast_value, non_broadcast_v) + : broadcast_value + non_broadcast_v; + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -106,31 +113,34 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + win, + [&](const Coordinates &) { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2; - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = + (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = + (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2; + } + }, + input1, input2, output); } } @@ -138,17 +148,36 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); -bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition); - -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); - -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition); + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); template <typename ScalarType> -void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); template <typename ScalarType> -void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); } // namespace cpu } // namespace arm_compute #endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H |