From 5a5945387e70f62e6e1e95a177fae261d7570443 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 3 Dec 2018 14:30:05 +0000
Subject: COMPMID-1809: Remove padding in NEArithmeticAdditionKernel.

Change-Id: I1806591a2c73a1f057f13d8c6107d7b9796a82c8
Reviewed-on: https://review.mlplatform.org/370
Tested-by: Arm Jenkins
Reviewed-by: Michalis Spyrou
---
 .../NEON/kernels/NEArithmeticAdditionKernel.cpp    | 643 ++++++++++++---------
 1 file changed, 356 insertions(+), 287 deletions(-)

(limited to 'src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp')

diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 954a2c1754..e74833cd41 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
@@ -47,337 +48,413 @@ class Coordinates;
 
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+template <typename T, bool is_sat>
+void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
-    },
-    input1, input2, output);
-}
+    ARM_COMPUTE_UNUSED(policy);
 
-void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
-    },
-    input1, input2, output);
-}
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
-{
-    const int16x8x2_t res =
-    {
-        {
-            vaddq_s16(a.val[0], b.val[0]),
-            vaddq_s16(a.val[1], b.val[1])
-        }
-    };
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    return res;
-}
+    constexpr int window_step_x         = 16 / sizeof(T);
+    const auto    window_start_x        = static_cast<int>(window.x().start());
+    const auto    window_end_x          = static_cast<int>(window.x().end());
+    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
 
-inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
-{
-    const float32x4x4_t res =
+    if(is_broadcast_across_x)
     {
-        {
-            vaddq_f32(a.val[0], b.val[0]),
-            vaddq_f32(a.val[1], b.val[1]),
-            vaddq_f32(a.val[2], b.val[2]),
-            vaddq_f32(a.val[3], b.val[3])
-        }
-    };
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
 
-    return res;
-}
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
-{
-    const int16x8x2_t res =
-    {
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
         {
-            vqaddq_s16(a.val[0], b.val[0]),
-            vqaddq_s16(a.val[1], b.val[1])
-        }
-    };
+            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());
 
-    return res;
-}
+            const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
+            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
 
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
-{
-    const float16x8x2_t res =
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+                const auto res             = is_sat ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
+                wrapper::vstore(output_ptr + x, res);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+                *(output_ptr + x) = is_sat ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
     {
-        {
-            vaddq_f16(a.val[0], b.val[0]),
-            vaddq_f16(a.val[1], b.val[1])
-        }
-    };
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    return res;
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
 
-void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
-        const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto val1 = wrapper::vloadq(input1_ptr + x);
+                const auto val2 = wrapper::vloadq(input2_ptr + x);
+                const auto res  = is_sat ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+                wrapper::vstore(output_ptr + x, res);
+            }
 
-        vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
-    },
-    input1, input2, output);
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_UNUSED(in1);
-    ARM_COMPUTE_UNUSED(in2);
-    ARM_COMPUTE_UNUSED(out);
-    ARM_COMPUTE_UNUSED(window);
-    ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const auto val1 = *(input1_ptr + x);
+                const auto val2 = *(input2_ptr + x);
+                *(output_ptr + x) = is_sat ? wrapper::add_sat(val1, val2) : val1 + val2;
+            }
+        },
+        input1, input2, output);
+    }
 }
 
-void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    ARM_COMPUTE_UNUSED(policy);
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
-        const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-        vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
-    },
-    input1, input2, output);
-}
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    const int  window_step_x         = 16;
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+    const float output_scale    = out->info()->quantization_info().scale;
+    const float invoutput_scale = 1.f / output_scale;
+    const int   output_offset   = out->info()->quantization_info().offset;
 
     const float32x4_t vscale1    = vdupq_n_f32(in1->info()->quantization_info().scale);
     const float32x4_t vscale2    = vdupq_n_f32(in2->info()->quantization_info().scale);
-    const float32x4_t invvscaleo = vdupq_n_f32(1.f / out->info()->quantization_info().scale);
+    const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
     const int32x4_t   voffset1   = vdupq_n_s32(in1->info()->quantization_info().offset);
     const int32x4_t   voffset2   = vdupq_n_s32(in2->info()->quantization_info().offset);
-    const float32x4_t voffseto   = vdupq_n_f32(out->info()->quantization_info().offset);
+    const float32x4_t voffseto   = vdupq_n_f32(output_offset);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    if(is_broadcast_across_x)
     {
-        const uint8x16_t a = vld1q_u8(input1.ptr());
-        const uint8x16_t b = vld1q_u8(input2.ptr());
-
-        const float32x4x4_t af =
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+        const QuantizationInfo broadcast_qinfo     = broadcast_tensor->info()->quantization_info();
+        const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info();
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
+
+            const uint8_t    broadcast_value     = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+            const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
+
+            const float32x4x4_t bf =
+            {
+                {
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
+                }
+            };
+            const float bfs = static_cast<float>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
-                vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
-                vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
-                vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                const uint8x16_t    a  = vld1q_u8(non_broadcast_input_ptr + x);
+                const float32x4x4_t af =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                    }
+                };
+
+                const int32x4x4_t rf =
+                {
+                    {
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+                    }
+                };
+
+                const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+                const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+                vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
             }
-        };
 
-        const float32x4x4_t bf =
     {
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
             {
-                vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
-                vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
-                vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
-                vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
+                const float afs   = static_cast<float>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
+                *(output_ptr + x) = std::max(0, std::min(static_cast<int32_t>((afs + bfs) * invoutput_scale + output_offset), 255));
             }
-        };
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
+        const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
 
-        const int32x4x4_t rf =
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates & id)
         {
+            const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
-                vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
-                vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
-                vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+                const uint8x16_t a = vld1q_u8(input1_ptr + x);
+                const uint8x16_t b = vld1q_u8(input2_ptr + x);
+
+                const float32x4x4_t af =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
+                    }
+                };
+
+                const float32x4x4_t bf =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
+                    }
+                };
+
+                const int32x4x4_t rf =
+                {
+                    {
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
+                    }
+                };
+
+                const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+                const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+                vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
             }
-        };
 
-        const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-        const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
-        vst1q_u8(output.ptr(), vcombine_u8(pa, pb));
-    },
-    input1, input2, output);
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const float afs   = static_cast<float>((*(input1_ptr + x)) - input1_qinfo.offset) * input1_qinfo.scale;
+                const float bfs   = static_cast<float>((*(input2_ptr + x)) - input2_qinfo.offset) * input2_qinfo.scale;
+                *(output_ptr + x) = std::max(0, std::min(static_cast<int32_t>((afs + bfs) * invoutput_scale + output_offset), 255));
+            }
+        },
+        input1, input2, output);
+    }
 }
 
-void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
-        const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-        vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
-    },
-    input1, input2, output);
-}
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
 
-void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
-        const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
-
-        vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
-    },
-    input1, input2, output);
-}
-
-void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    const int  window_step_x  = 8;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(win, [&](const Coordinates & id)
     {
+        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
 
-        const int16x8x2_t a =
+        if(policy == ConvertPolicy::WRAP)
         {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
+                const auto vin1 = wrapper::vloadq(input1_ptr + x);
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
             }
-        };
-        const uint8x16_t b = vld1q_u8(input2.ptr());
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
-    },
-    input1, input2, output);
-}
 
-void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int16x8x2_t a =
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
+            }
+        }
+        else
         {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
+                const auto vin1 = wrapper::vloadq(input1_ptr + x);
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
             }
-        };
-        const uint8x16_t b = vld1q_u8(input2.ptr());
 
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
+            }
+        }
     },
     input1, input2, output);
 }
 
-inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
+inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window)
 {
-    //Simply swap the two input buffers:
-    add_wrap_S16_U8_S16(input2, input1, output, window);
+    // Simply swap the two input buffers:
+    add_S16_U8_S16(input2, input1, output, policy, window);
 }
 
-inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
+void add_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
 {
-    //Simply swap the two input buffers:
-    add_saturate_S16_U8_S16(input2, input1, output, window);
-}
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
+
+    const int  window_step_x  = 8;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    execute_window_loop(win, [&](const Coordinates & id)
     {
-        const uint8x16_t a = vld1q_u8(input1.ptr());
-        const uint8x16_t b = vld1q_u8(input2.ptr());
+        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
 
-        const int16x8x2_t a_s16 =
+        if(policy == ConvertPolicy::WRAP)
         {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
+                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
             }
-        };
 
-        const int16x8x2_t b_s16 =
-        {
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
             {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
+                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
             }
-        };
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
-    },
-    input1, input2, output);
-}
-
-void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const uint8x16_t a = vld1q_u8(input1.ptr());
-        const uint8x16_t b = vld1q_u8(input2.ptr());
-
-        const int16x8x2_t a_s16 =
+        }
+        else
         {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
+                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
             }
-        };
 
-        const int16x8x2_t b_s16 =
-        {
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
             {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
+                *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
+                                                     static_cast<int16_t>(*(input2_ptr + x)));
             }
-        };
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
+        }
     },
     input1, input2, output);
 }
@@ -393,6 +470,9 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
     const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type())
+                                                                                                 || (input2.data_type() != output.data_type())),
+                                    "Broadcasting across width is supported on configurations where all tensors have the same data type");
 
     // Validate in case of configured output
     if(output.total_size() > 0)
@@ -443,27 +523,20 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITe
         }
     }
 
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+    Window win = calculate_max_window(valid_region, Steps());
 
-    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+    // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output.num_dimensions());
+    output.set_valid_region(valid_region);
 
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace
 
 NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy()
 {
 }
 
@@ -478,27 +551,30 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor
 
     static std::map<std::string, NEArithmeticAdditionKernel::AddFunction *> map_function =
     {
-        { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
-        { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
-        { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
-        { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
-        { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
-        { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
-        { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
-        { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
-        { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
-        { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
-        { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
-        { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
-        { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
-        { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
         { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
         { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
+        { "add_wrap_U8_U8_U8", &add_same<uint8_t, false> },
+        { "add_saturate_U8_U8_U8", &add_same<uint8_t, true> },
+        { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
+        { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
+        { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
+        { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
+        { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
+        { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
+        { "add_wrap_S16_S16_S16", &add_same<int16_t, false> },
+        { "add_saturate_S16_S16_S16", &add_same<int16_t, true> },
+        { "add_wrap_F32_F32_F32", &add_same<float, false> },
+        { "add_saturate_F32_F32_F32", &add_same<float, false> },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { "add_wrap_F16_F16_F16", &add_same<float16_t, false> },
+        { "add_saturate_F16_F16_F16", &add_same<float16_t, false> },
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     };
 
     _input1 = input1;
     _input2 = input2;
     _output = output;
+    _policy = policy;
 
     std::string function_to_call("add_");
     function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
@@ -533,12 +609,5 @@ void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &inf
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input1, _input2, _output, window);
-}
-
-BorderSize NEArithmeticAdditionKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize(0, border, 0, 0);
+    (*_func)(_input1, _input2, _output, _policy, window);
 }
--
cgit v1.2.1
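For context, a minimal sketch of how this kernel is typically exercised through the public NEArithmeticAddition function. It is illustrative only: the shapes are arbitrary and the four-argument configure() signature is assumed from the Compute Library API of this era, not taken from the patch. The odd width (17) lands in the new scalar left-over loop, and the width-1 second input lands in the new broadcast-across-x path; after this change neither requires any tensor padding or border handling.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // 17x2 + 1x2 broadcast across width; all tensors share one data type,
        // as required by the new validate_arguments() broadcast check.
        Tensor in1, in2, out;
        in1.allocator()->init(TensorInfo(TensorShape(17U, 2U), 1, DataType::F32));
        in2.allocator()->init(TensorInfo(TensorShape(1U, 2U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(17U, 2U), 1, DataType::F32));

        // For F32 the saturate/wrap policies dispatch to the same add_same<float, false>.
        NEArithmeticAddition add;
        add.configure(&in1, &in2, &out, ConvertPolicy::SATURATE);

        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();

        add.run();
        return 0;
    }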