From 5a5945387e70f62e6e1e95a177fae261d7570443 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 3 Dec 2018 14:30:05 +0000
Subject: COMPMID-1809: Remove padding in NEGEMMConvolutionLayer 64-bit path.

Change-Id: I1806591a2c73a1f057f13d8c6107d7b9796a82c8
Reviewed-on: https://review.mlplatform.org/370
Tested-by: Arm Jenkins
Reviewed-by: Michalis Spyrou
---
 src/core/NEON/kernels/NEActivationLayerKernel.cpp  | 551 ++++++------------
 .../NEON/kernels/NEArithmeticAdditionKernel.cpp    | 643 ++++++++++++---------
 ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp |  40 +-
 .../NEON/functions/NEArithmeticAddition.cpp        |  12 +-
 4 files changed, 559 insertions(+), 687 deletions(-)

diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 5ce79f1007..97cb9ceb2e 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/QAsymm8.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -60,29 +61,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    bool window_changed = false;
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
 
-    if(output != nullptr && (output->total_size() != 0))
+    if(output != nullptr)
     {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-        window_changed = update_window_and_padding(win,
-                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration),
-                                                   output_access);
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output, *input->clone());
 
-        output_access.set_valid_region(win, input->valid_region());
-    }
-    else
-    {
-        // In-place computation
-        window_changed = update_window_and_padding(win,
-                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+        // NEActivationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
     }
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace
 
@@ -101,15 +94,13 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat
 
     if(output != nullptr)
     {
-        // Output auto initialization if not yet initialized
-        auto_init_if_empty(*output->info(), *input->info()->clone());
         _output = output;
     }
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ?
output->info() : nullptr)); - ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU), + ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU) + && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU), "For QASYMM8 only relu and lower/upper bounded relu are supported"); // Activation functions : FP32 @@ -176,337 +167,129 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat ICPPKernel::configure(win_config.second); } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) +typename std::enable_if::value, void>::type +NEActivationLayerKernel::activation(const Window &window) { - Iterator input(_input, window); - Iterator output(_output, window); + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationFunction act = F; - static const float16x8_t CONST_0 = vdupq_n_f16(0.f); - static const float16x8_t CONST_1_H = vdupq_n_f16(1.f); + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f); + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); - const float16x8_t a = vdupq_n_f16(_act_info.a()); - const float16x4_t a_h = vdup_n_f16(_act_info.a()); - const float16x8_t b = vdupq_n_f16(_act_info.b()); + const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + const auto va = wrapper::vdup_n(static_cast(_act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast(_act_info.b()), ExactTagType{}); + const auto a = static_cast(_act_info.a()); + const auto b = static_cast(_act_info.b()); - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(win_collapsed, [&](const Coordinates & id) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const float16x8x2_t in = vld2q_f16(input_ptr); - float16x8x2_t tmp = { {} }; + wrapper::traits::neon_bitvector_t tmp; - switch(F) + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - case ActivationFunction::ABS: - tmp = - { - { - vabsq_f16(in.val[0]), - vabsq_f16(in.val[1]), - } - }; - break; - case ActivationFunction::BOUNDED_RELU: - tmp = - { - { - vminq_f16(a, vmaxq_f16(CONST_0, in.val[0])), - vminq_f16(a, vmaxq_f16(CONST_0, in.val[1])) - } - }; - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = - { - { - vminq_f16(a, vmaxq_f16(b, in.val[0])), - vminq_f16(a, vmaxq_f16(b, in.val[1])) - } - }; - break; - case 
ActivationFunction::LINEAR: - tmp = - { - { - vaddq_f16(b, vmulq_f16(a, in.val[0])), - vaddq_f16(b, vmulq_f16(a, in.val[1])) - } - }; - break; - case ActivationFunction::LOGISTIC: - { - tmp = - { - { - vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))), - vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1])))) - } - }; - } - break; - case ActivationFunction::RELU: - tmp = - { - { - vmaxq_f16(CONST_0, in.val[0]), - vmaxq_f16(CONST_0, in.val[1]) - } - }; - break; - case ActivationFunction::LEAKY_RELU: - tmp = - { - { - vbslq_f16(vcgtq_f16(in.val[0], CONST_0), in.val[0], vmulq_f16(a, in.val[0])), - vbslq_f16(vcgtq_f16(in.val[1], CONST_0), in.val[1], vmulq_f16(a, in.val[1])) - } - }; - break; - case ActivationFunction::SOFT_RELU: - { - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float16x4x2_t in0 = - { - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[0])))))), - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[0])))))), - }; - - const float16x4x2_t in1 = - { - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[1])))))), - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[1])))))), - }; - - tmp = - { - { - vcombine_f16(in0.val[0], in0.val[1]), - vcombine_f16(in1.val[0], in1.val[1]), - } - }; - } - break; - case ActivationFunction::SQRT: - tmp = - { - { - vinvq_f16(vinvsqrtq_f16(in.val[0])), - vinvq_f16(vinvsqrtq_f16(in.val[1])), - } - }; - break; - case ActivationFunction::SQUARE: - tmp = - { - { - vmulq_f16(in.val[0], in.val[0]), - vmulq_f16(in.val[1], in.val[1]) - } - }; - break; - case ActivationFunction::TANH: + const auto vin = wrapper::vloadq(input_ptr + x); + switch(act) { - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float16x8x2_t mul = - { - vmulq_f16(b, in.val[0]), - vmulq_f16(b, in.val[1]) - }; - const float16x4x2_t in0 = - { - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[0]))))), - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[0]))))), - }; - - const float16x4x2_t in1 = - { - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[1]))))), - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[1]))))), - }; - - tmp = - { - { - vcombine_f16(in0.val[0], in0.val[1]), - vcombine_f16(in1.val[0], in1.val[1]), - } - }; + case ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationFunction::SOFT_RELU: + tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))); + break; + case ActivationFunction::SQRT: + tmp = wrapper::vinv(wrapper::vinvsqrt(vin)); + break; + case ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + 
default: + ARM_COMPUTE_ERROR("Unsupported activation function"); } - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - break; + wrapper::vstore(output_ptr + x, tmp); } - vst2q_f16(output_ptr, tmp); - }, - input, output); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) -{ - Iterator input(_input, window); - Iterator output(_output, window); - - static const float32x4_t CONST_1 = vdupq_n_f32(1.f); - static const float32x4_t CONST_0 = vdupq_n_f32(0.f); - const float32x4_t a = vdupq_n_f32(_act_info.a()); - const float32x4_t b = vdupq_n_f32(_act_info.b()); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const float32x4x4_t in = + // Compute left-over elements + for(; x < window_end_x; ++x) { + const T in = *(reinterpret_cast(input_ptr + x)); + T tmp; + switch(act) { - vld1q_f32(input_ptr), - vld1q_f32(input_ptr + 4), - vld1q_f32(input_ptr + 8), - vld1q_f32(input_ptr + 12) + case ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationFunction::LOGISTIC: + tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); + break; + case ActivationFunction::RELU: + tmp = std::max(static_cast(0), in); + break; + case ActivationFunction::BOUNDED_RELU: + tmp = std::min(a, std::max(static_cast(0), in)); + break; + case ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min(a, std::max(b, in)); + break; + case ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationFunction::SOFT_RELU: + tmp = std::log(static_cast(1) + std::exp(in)); + break; + case ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); } - }; - float32x4x4_t tmp = { {} }; - - switch(F) - { - case ActivationFunction::ABS: - tmp = - { - { - vabsq_f32(in.val[0]), - vabsq_f32(in.val[1]), - vabsq_f32(in.val[2]), - vabsq_f32(in.val[3]), - } - }; - break; - case ActivationFunction::LINEAR: - tmp = - { - { - vmlaq_f32(b, a, in.val[0]), - vmlaq_f32(b, a, in.val[1]), - vmlaq_f32(b, a, in.val[2]), - vmlaq_f32(b, a, in.val[3]), - } - }; - break; - case ActivationFunction::LOGISTIC: - tmp = - { - { - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[0])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[1])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[2])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[3])))), - } - }; - break; - case ActivationFunction::RELU: - tmp = - { - { - vmaxq_f32(CONST_0, in.val[0]), - vmaxq_f32(CONST_0, in.val[1]), - vmaxq_f32(CONST_0, in.val[2]), - vmaxq_f32(CONST_0, in.val[3]), - } - }; - break; - case ActivationFunction::BOUNDED_RELU: - tmp = - { - { - vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])), - } - }; - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = - { - { - vminq_f32(a, vmaxq_f32(b, in.val[0])), - vminq_f32(a, vmaxq_f32(b, in.val[1])), - vminq_f32(a, vmaxq_f32(b, in.val[2])), - vminq_f32(a, vmaxq_f32(b, in.val[3])), - } - }; - break; - case 
ActivationFunction::LEAKY_RELU: - tmp = - { - { - vbslq_f32(vcgtq_f32(in.val[0], CONST_0), in.val[0], vmulq_f32(a, in.val[0])), - vbslq_f32(vcgtq_f32(in.val[1], CONST_0), in.val[1], vmulq_f32(a, in.val[1])), - vbslq_f32(vcgtq_f32(in.val[2], CONST_0), in.val[2], vmulq_f32(a, in.val[2])), - vbslq_f32(vcgtq_f32(in.val[3], CONST_0), in.val[3], vmulq_f32(a, in.val[3])), - } - }; - break; - case ActivationFunction::SOFT_RELU: - tmp = - { - { - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[0]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[1]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[2]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[3]))), - } - }; - break; - case ActivationFunction::SQRT: - tmp = - { - { - vinvq_f32(vinvsqrtq_f32(in.val[0])), - vinvq_f32(vinvsqrtq_f32(in.val[1])), - vinvq_f32(vinvsqrtq_f32(in.val[2])), - vinvq_f32(vinvsqrtq_f32(in.val[3])), - } - }; - break; - case ActivationFunction::SQUARE: - tmp = - { - { - vmulq_f32(in.val[0], in.val[0]), - vmulq_f32(in.val[1], in.val[1]), - vmulq_f32(in.val[2], in.val[2]), - vmulq_f32(in.val[3], in.val[3]), - } - }; - break; - case ActivationFunction::TANH: - tmp = - { - { - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[0]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[1]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[2]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[3]))), - } - }; - break; - default: - break; + *(output_ptr + x) = tmp; } - - vst1q_f32(output_ptr, tmp.val[0]); - vst1q_f32(output_ptr + 4, tmp.val[1]); - vst1q_f32(output_ptr + 8, tmp.val[2]); - vst1q_f32(output_ptr + 12, tmp.val[3]); }, input, output); } @@ -514,13 +297,25 @@ typename std::enable_if::value, void>::type NEActivationL template typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) { - Iterator input(_input, window); - Iterator output(_output, window); - const QuantizationInfo qi_in = _input->info()->quantization_info(); - const QuantizationInfo qi_out = _output->info()->quantization_info(); - const qasymm8x16_t a = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset)); - const qasymm8x16_t b = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset)); - const qasymm8x16_t CONST_0 = vdupq_n_u8(sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset)); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationFunction act = F; + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + + const QuantizationInfo qi_in = _input->info()->quantization_info(); + const QuantizationInfo qi_out = _output->info()->quantization_info(); + const qasymm8x16_t va = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset)); + const qasymm8x16_t vb = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset)); + const qasymm8_t a = sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset); + const qasymm8_t b = sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset); + const qasymm8_t const_0 = sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset); + const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); // Initialise scale/offset for re-quantization float s = qi_in.scale / qi_out.scale; @@ -528,34 +323,72 @@ typename std::enable_if::value, void>::type NEActivat 
float32x4_t vs = vdupq_n_f32(s); float32x4_t vo = vdupq_n_f32(o); - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(win_collapsed, [&](const Coordinates & id) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const qasymm8x16_t in = vld1q_u8(input_ptr); - qasymm8x16_t tmp = {}; + wrapper::traits::neon_bitvector_t tmp; - switch(F) + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - case ActivationFunction::LU_BOUNDED_RELU: + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationFunction::RELU) + { // Perform activation - tmp = vminq_u8(a, vmaxq_u8(b, in)); + tmp = vmaxq_u8(vconst_0, vin); // Re-quantize to new output space tmp = vmlaq_qasymm8(tmp, vs, vo); - break; - case ActivationFunction::RELU: + } + else if(act == ActivationFunction::BOUNDED_RELU) + { // Perform activation - tmp = vmaxq_u8(CONST_0, in); + tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); // Re-quantize to new output space tmp = vmlaq_qasymm8(tmp, vs, vo); - break; - default: - ARM_COMPUTE_ERROR("Function not implemented"); - break; + } + else if(act == ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - vst1q_u8(output_ptr, tmp); + // Compute left-over elements + for(; x < window_end_x; ++x) + { + T in = *(reinterpret_cast(input_ptr + x)); + T tmp; + if(act == ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = std::max(0, std::min(static_cast(tmp * s + o), 255)); + } + else if(act == ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = std::max(0, std::min(static_cast(tmp * s + o), 255)); + } + else if(act == ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = std::max(0, std::min(static_cast(tmp * s + o), 255)); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } }, input, output); } diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp index 954a2c1754..e74833cd41 100644 --- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" @@ -47,337 +48,413 @@ class Coordinates; namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; - -void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +template +void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window) { - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr()))); - }, - input1, input2, output); -} + ARM_COMPUTE_UNUSED(policy); -void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - execute_window_loop(window, [&](const Coordinates & id) - { - vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr()))); - }, - input1, input2, output); -} + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); -inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b) -{ - const int16x8x2_t res = - { - { - vaddq_s16(a.val[0], b.val[0]), - vaddq_s16(a.val[1], b.val[1]) - } - }; + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); - return res; -} + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); -inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b) -{ - const float32x4x4_t res = + if(is_broadcast_across_x) { - { - vaddq_f32(a.val[0], b.val[0]), - vaddq_f32(a.val[1], b.val[1]), - vaddq_f32(a.val[2], b.val[2]), - vaddq_f32(a.val[3], b.val[3]) - } - }; + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - return res; -} + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); -inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b) -{ - const int16x8x2_t res = - { + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates & id) { - vqaddq_s16(a.val[0], b.val[0]), - vqaddq_s16(a.val[1], b.val[1]) - } - }; + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - return res; -} + const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b) -{ - const float16x8x2_t res = + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto res = is_sat ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = is_sat ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; + } + }, + broadcast_input, non_broadcast_input, output); + } + else { - { - vaddq_f16(a.val[0], b.val[0]), - vaddq_f16(a.val[1], b.val[1]) - } - }; + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - return res; -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); -void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + execute_window_loop(win, [&](const Coordinates & id) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - execute_window_loop(window, [&](const Coordinates & id) - { - const float16x8x2_t a = vld2q_f16(reinterpret_cast(input1.ptr())); - const float16x8x2_t b = vld2q_f16(reinterpret_cast(input2.ptr())); + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = is_sat ? 
wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); + wrapper::vstore(output_ptr + x, res); + } - vst2q_f16(reinterpret_cast(output.ptr()), vadd2q_f16(a, b)); - }, - input1, input2, output); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_UNUSED(in1); - ARM_COMPUTE_UNUSED(in2); - ARM_COMPUTE_UNUSED(out); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a"); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = is_sat ? wrapper::add_sat(val1, val2) : val1 + val2; + } + }, + input1, input2, output); + } } -void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window) { - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + ARM_COMPUTE_UNUSED(policy); - execute_window_loop(window, [&](const Coordinates & id) - { - const float32x4x4_t a = vld4q_f32(reinterpret_cast(input1.ptr())); - const float32x4x4_t b = vld4q_f32(reinterpret_cast(input2.ptr())); + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - vst4q_f32(reinterpret_cast(output.ptr()), vadd4q_f32(a, b)); - }, - input1, input2, output); -} + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); -void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + const float output_scale = out->info()->quantization_info().scale; + const float invoutput_scale = 1.f / output_scale; + const int output_offset = out->info()->quantization_info().offset; const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale); const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / out->info()->quantization_info().scale); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale); const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset); const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset); - const float32x4_t voffseto = vdupq_n_f32(out->info()->quantization_info().offset); + const float32x4_t voffseto = vdupq_n_f32(output_offset); - execute_window_loop(window, [&](const Coordinates & id) + if(is_broadcast_across_x) { - const uint8x16_t a = vld1q_u8(input1.ptr()); - const uint8x16_t b = vld1q_u8(input2.ptr()); - - const float32x4x4_t af = + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window 
broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info(); + const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates & id) { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2), + } + }; + const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + } + }; + + const int32x4x4_t rf = + { + { + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)), + } + }; + + 
const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); } - }; - const float32x4x4_t bf = - { + // Compute left-over elements + for(; x < window_end_x; ++x) { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2), + const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; + *(output_ptr + x) = std::max(0, std::min(static_cast((afs + bfs) * invoutput_scale + output_offset), 255)); } - }; + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const QuantizationInfo input1_qinfo = in1->info()->quantization_info(); + const QuantizationInfo input2_qinfo = in2->info()->quantization_info(); - const int32x4x4_t rf = + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates & id) { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)), + const uint8x16_t a = vld1q_u8(input1_ptr + x); + const uint8x16_t b = vld1q_u8(input2_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + } + }; + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2), + 
vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2), + } + }; + + const int32x4x4_t rf = + { + { + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)), + } + }; + + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); } - }; - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - vst1q_u8(output.ptr(), vcombine_u8(pa, pb)); - }, - input1, input2, output); + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast((*(input1_ptr + x)) - input1_qinfo.offset) * input1_qinfo.scale; + const float bfs = static_cast((*(input2_ptr + x)) - input2_qinfo.offset) * input2_qinfo.scale; + *(output_ptr + x) = std::max(0, std::min(static_cast((afs + bfs) * invoutput_scale + output_offset), 255)); + } + }, + input1, input2, output); + } } -void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window) { - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t a = vld2q_s16(reinterpret_cast(input1.ptr())); - const int16x8x2_t b = vld2q_s16(reinterpret_cast(input2.ptr())); + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - vst2q_s16(reinterpret_cast(output.ptr()), vadd2q_s16(a, b)); - }, - input1, input2, output); -} + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); -void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + const int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(win, [&](const Coordinates & id) { - const int16x8x2_t a = vld2q_s16(reinterpret_cast(input1.ptr())); - const int16x8x2_t b = vld2q_s16(reinterpret_cast(input2.ptr())); - - vst2q_s16(reinterpret_cast(output.ptr()), vqadd2q_s16(a, b)); - }, - 
input1, input2, output); -} - -void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t a = + if(policy == ConvertPolicy::WRAP) { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vld1q_s16(reinterpret_cast(input1.ptr())), - vld1q_s16(reinterpret_cast(input1.ptr()) + 8) + const auto vin1 = wrapper::vloadq(input1_ptr + x); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2)); } - }; - const uint8x16_t b = vld1q_u8(input2.ptr()); - - vst1q_s16(reinterpret_cast(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))))); - vst1q_s16(reinterpret_cast(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))))); - }, - input1, input2, output); -} - -void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t a = + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(output_ptr + x) = *(input1_ptr + x) + static_cast(*(input2_ptr + x)); + } + } + else { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vld1q_s16(reinterpret_cast(input1.ptr())), - vld1q_s16(reinterpret_cast(input1.ptr()) + 8) + const auto vin1 = wrapper::vloadq(input1_ptr + x); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2)); } - }; - const uint8x16_t b = vld1q_u8(input2.ptr()); - vst1q_s16(reinterpret_cast(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))))); - vst1q_s16(reinterpret_cast(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))))); + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast(*(input2_ptr + x))); + } + } }, input1, input2, output); } -inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window) +inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window) { - //Simply swap the two input buffers: - add_wrap_S16_U8_S16(input2, input1, output, window); + // Simply swap the two input buffers: + add_S16_U8_S16(input2, input1, output, policy, window); } -inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window) +void add_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy 
policy, const Window &window) { - //Simply swap the two input buffers: - add_saturate_S16_U8_S16(input2, input1, output, window); -} + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); -void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - execute_window_loop(window, [&](const Coordinates & id) + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + const int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + execute_window_loop(win, [&](const Coordinates & id) { - const uint8x16_t a = vld1q_u8(input1.ptr()); - const uint8x16_t b = vld1q_u8(input2.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int16x8x2_t a_s16 = + if(policy == ConvertPolicy::WRAP) { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))) + const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x))); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2)); } - }; - const int16x8x2_t b_s16 = - { + // Compute left-over elements + for(; x < window_end_x; ++x) { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))) + *(output_ptr + x) = static_cast(*(input1_ptr + x)) + static_cast(*(input2_ptr + x)); } - }; - - vst1q_s16(reinterpret_cast(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0])); - vst1q_s16(reinterpret_cast(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1])); - }, - input1, input2, output); -} - -void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8x16_t a = vld1q_u8(input1.ptr()); - const uint8x16_t b = vld1q_u8(input2.ptr()); - - const int16x8x2_t a_s16 = + } + else { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))) + const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x))); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + 
wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2)); } - }; - const int16x8x2_t b_s16 = - { + // Compute left-over elements + for(; x < window_end_x; ++x) { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))) + *(output_ptr + x) = wrapper::add_sat(static_cast(*(input1_ptr + x)), + static_cast(*(input2_ptr + x))); } - }; - - vst1q_s16(reinterpret_cast(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0])); - vst1q_s16(reinterpret_cast(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1])); + } }, input1, input2, output); } @@ -393,6 +470,9 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type()) + || (input2.data_type() != output.data_type())), + "Broadcasting across width is supported on configurations where all tensors have the same data type"); // Validate in case of configured output if(output.total_size() > 0) @@ -443,27 +523,20 @@ std::pair validate_and_configure_window(ITensorInfo &input1, ITe } } - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(input1); - Window win_input2 = win.broadcast_if_dimension_le_one(input2); + Window win = calculate_max_window(valid_region, Steps()); - AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration); + // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output.num_dimensions()); + output.set_valid_region(valid_region); - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + return std::make_pair(Status{}, win); + ; } } // namespace NEArithmeticAdditionKernel::NEArithmeticAdditionKernel() - : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr) + : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy() { } @@ -478,27 +551,30 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor static std::map map_function = { - { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 }, - { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 }, - { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 }, - { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 }, - { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 }, - { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 }, - { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 }, - { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 }, - { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 }, - { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 }, - { "add_wrap_F32_F32_F32", &add_F32_F32_F32 }, - { "add_saturate_F32_F32_F32", &add_F32_F32_F32 }, - { "add_wrap_F16_F16_F16", &add_F16_F16_F16 }, - { "add_saturate_F16_F16_F16", &add_F16_F16_F16 }, { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 }, { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 }, + { "add_wrap_U8_U8_U8", &add_same }, + { "add_saturate_U8_U8_U8", &add_same }, + { "add_wrap_S16_U8_S16", &add_S16_U8_S16 }, + { "add_saturate_S16_U8_S16", &add_S16_U8_S16 }, + { "add_wrap_U8_S16_S16", &add_U8_S16_S16 }, + { "add_saturate_U8_S16_S16", &add_U8_S16_S16 }, + { "add_wrap_U8_U8_S16", &add_U8_U8_S16 }, + { "add_saturate_U8_U8_S16", &add_U8_U8_S16 }, + { "add_wrap_S16_S16_S16", &add_same }, + { "add_saturate_S16_S16_S16", &add_same }, + { "add_wrap_F32_F32_F32", &add_same }, + { "add_saturate_F32_F32_F32", &add_same }, +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { "add_wrap_F16_F16_F16", &add_same }, + { "add_saturate_F16_F16_F16", &add_same }, +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ }; _input1 = input1; _input2 = input2; _output = output; + _policy = policy; std::string function_to_call("add_"); function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_"; @@ -533,12 +609,5 @@ void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input1, _input2, _output, window); -} - -BorderSize NEArithmeticAdditionKernel::border_size() const -{ - const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); + (*_func)(_input1, _input2, _output, _policy, window); } diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index 024c4f8863..f0ac695b20 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
@@ -66,37 +66,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
-    // Note: This kernel performs 16 elements per iteration.
-    // However, since we use a left-over for loop, we cannot have any read or write out of memory
-    // For this reason num_elems_processed_per_iteration is set to 1
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win,
-                                                    input_access);
-
-    if(output->total_size() != 0)
-    {
-        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-    }
+    Window win = calculate_max_window(*input, Steps());
 
-    if(bias != nullptr)
-    {
-        AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
-        window_changed = window_changed || update_window_and_padding(win, bias_access);
-    }
+    // NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace
 
@@ -269,7 +252,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const
     _max = max;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    auto win_config = validate_and_configure_window(input->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 
@@ -282,10 +265,7 @@ Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
-                                                              output->clone().get())
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
     return Status{};
 }
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 677e9f676f..b1550778c3 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -36,16 +36,6 @@ void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
 }
 
 Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
--
cgit v1.2.1
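
The refactor repeated across all four files is one pattern. Each kernel previously processed a fixed 16 elements per window step, which forced callers to allocate padded tensors and forced NEArithmeticAddition to configure a REPLICATE border handler. The rewritten kernels collapse the window, set the X dimension to step 1, and walk X manually: a full-width vector main loop followed by a scalar left-over loop, so no access ever lands outside the tensor. The sketch below is a minimal standalone illustration of that pattern using plain NEON intrinsics rather than the library's wrapper:: layer; the function name and signature are illustrative, not part of this patch.

// Minimal standalone sketch (not ACL code): vector main loop + scalar
// left-over loop, the structure this patch gives every rewritten kernel.
// Assumes an ARM toolchain providing <arm_neon.h>.
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

void add_u8_saturate(const uint8_t *in1, const uint8_t *in2, uint8_t *out, size_t len)
{
    constexpr size_t step = 16; // 16 uint8_t lanes per 128-bit NEON register
    size_t x = 0;

    // Vector body: whole 16-element chunks only, never reading past len
    for(; x + step <= len; x += step)
    {
        const uint8x16_t a = vld1q_u8(in1 + x);
        const uint8x16_t b = vld1q_u8(in2 + x);
        vst1q_u8(out + x, vqaddq_u8(a, b)); // saturating add, as in ConvertPolicy::SATURATE
    }

    // Scalar tail: the remaining 0..15 elements, replacing the old padded border
    for(; x < len; ++x)
    {
        const unsigned int sum = static_cast<unsigned int>(in1[x]) + in2[x];
        out[x] = static_cast<uint8_t>(sum > 255 ? 255 : sum);
    }
}

Because the tail is handled in scalars, the kernels no longer read or write past the end of the buffer. That is what lets validate_and_configure_window() return calculate_max_window(..., Steps()) without calling update_window_and_padding(), and why NEArithmeticAdditionKernel::border_size() is deleted outright.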