From fc6744a8b18e70c84d5f98d013809b9796d48b38 Mon Sep 17 00:00:00 2001 From: Sheri Zhang Date: Wed, 13 Jan 2021 15:54:05 +0000 Subject: Make Sub kernel and operator stateless - Rename NEArithmeticSubstractionKernel to CpuSubKernel and move files appropriately - Add CpuSub under src/runtime/cpu/operators Partially resolves: COMPMID-4007 Signed-off-by: Sheri Zhang Change-Id: I4754ca9101d82dccacca744be6d069764a9c6b55 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4868 Tested-by: Arm Jenkins Reviewed-by: Pablo Marquez Tello Comments-Addressed: Arm Jenkins --- Android.bp | 7 +- .../NEON/functions/NEArithmeticSubtraction.h | 68 +- arm_compute/runtime/NEON/functions/NEQLSTMLayer.h | 2 +- src/core/NEON/NEKernels.h | 1 - .../NEON/kernels/NEArithmeticSubtractionKernel.cpp | 833 --------------------- .../NEON/kernels/NEArithmeticSubtractionKernel.h | 118 --- src/core/cpu/kernels/CpuSubKernel.cpp | 251 +++++++ src/core/cpu/kernels/CpuSubKernel.h | 98 +++ src/core/cpu/kernels/sub/neon/integer.cpp | 183 +++++ src/core/cpu/kernels/sub/neon/list.h | 162 ++++ src/core/cpu/kernels/sub/neon/qasymm8.cpp | 230 ++++++ src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp | 229 ++++++ src/core/cpu/kernels/sub/neon/qsymm16.cpp | 201 +++++ .../NEON/functions/NEArithmeticSubtraction.cpp | 33 +- src/runtime/cpu/operators/CpuSub.cpp | 46 ++ src/runtime/cpu/operators/CpuSub.h | 86 +++ 16 files changed, 1504 insertions(+), 1044 deletions(-) delete mode 100644 src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp delete mode 100644 src/core/NEON/kernels/NEArithmeticSubtractionKernel.h create mode 100644 src/core/cpu/kernels/CpuSubKernel.cpp create mode 100644 src/core/cpu/kernels/CpuSubKernel.h create mode 100644 src/core/cpu/kernels/sub/neon/integer.cpp create mode 100644 src/core/cpu/kernels/sub/neon/list.h create mode 100644 src/core/cpu/kernels/sub/neon/qasymm8.cpp create mode 100644 src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp create mode 100644 src/core/cpu/kernels/sub/neon/qsymm16.cpp create mode 100644 src/runtime/cpu/operators/CpuSub.cpp create mode 100644 src/runtime/cpu/operators/CpuSub.h diff --git a/Android.bp b/Android.bp index 185097d741..8d6182f820 100644 --- a/Android.bp +++ b/Android.bp @@ -220,7 +220,6 @@ cc_library_static { "src/core/MultiImageInfo.cpp", "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp", "src/core/NEON/kernels/NEAccumulateKernel.cpp", - "src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp", "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp", "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp", "src/core/NEON/kernels/NEBitwiseAndKernel.cpp", @@ -419,6 +418,7 @@ cc_library_static { "src/core/cpu/kernels/CpuFloorKernel.cpp", "src/core/cpu/kernels/CpuPermuteKernel.cpp", "src/core/cpu/kernels/CpuReshapeKernel.cpp", + "src/core/cpu/kernels/CpuSubKernel.cpp", "src/core/cpu/kernels/activation/NEON/fp16.cpp", "src/core/cpu/kernels/activation/NEON/fp32.cpp", "src/core/cpu/kernels/activation/NEON/qasymm8.cpp", @@ -439,6 +439,10 @@ cc_library_static { "src/core/cpu/kernels/add/sve/qsymm16.cpp", "src/core/cpu/kernels/floor/NEON/fp16.cpp", "src/core/cpu/kernels/floor/NEON/fp32.cpp", + "src/core/cpu/kernels/sub/neon/integer.cpp", + "src/core/cpu/kernels/sub/neon/qasymm8.cpp", + "src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/sub/neon/qsymm16.cpp", "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp", "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp", "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp", @@ -790,6 +794,7 @@ cc_library_static { "src/runtime/cpu/operators/CpuFloor.cpp", "src/runtime/cpu/operators/CpuPermute.cpp", "src/runtime/cpu/operators/CpuReshape.cpp", + "src/runtime/cpu/operators/CpuSub.cpp", "src/runtime/gpu/cl/operators/ClConcatenate.cpp", "utils/CommonGraphOptions.cpp", "utils/GraphUtils.cpp", diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index 5d2475b3a4..c741db3223 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,75 +32,13 @@ namespace arm_compute { class ITensor; -namespace experimental -{ -/** Basic function to run @ref NEArithmeticSubtractionKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function performs an arithmetic subtraction between two tensors. - * - * This function calls the following kernels: - * -# @ref NEArithmeticSubtractionKernel - */ -class NEArithmeticSubtraction : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[out] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction - * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 - * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace experimental - -/** Basic function to run @ref NEArithmeticSubtractionKernel +/** Basic function to run @ref cpu::kernels::CpuSubKernel * * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. * @note The function performs an arithmetic subtraction between two tensors. * * This function calls the following kernels: - * -# @ref NEArithmeticSubtractionKernel + * -# @ref cpu::kernels::CpuSubKernel */ class NEArithmeticSubtraction : public IFunction { diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index 34f51d3d30..743a32c47d 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -51,7 +51,7 @@ class NEGEMMLowpMatrixAReductionKernel; * * -# @ref NEActivationLayer Activation functions (tanh and logistic) * -# @ref NEArithmeticAddition Elementwise addition - * -# @ref NEArithmeticSubtractionKernel Elementwise subtraction + * -# @ref NEArithmeticSubtraction Elementwise subtraction * -# @ref NECopy Copy kernel for copying output_state_out to output * -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index f20ecc183e..a678a86e4c 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -27,7 +27,6 @@ /* Header regrouping all the NEON kernels */ #include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" #include "src/core/NEON/kernels/NEAccumulateKernel.h" -#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h" #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" #include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" #include "src/core/NEON/kernels/NEBitwiseAndKernel.h" diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp deleted file mode 100644 index 187e97dd49..0000000000 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp +++ /dev/null @@ -1,833 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace -{ -template -inline typename std::enable_if::value, int8_t>::type -quantize(float val, const QuantizationInfo &info) -{ - return quantize_qasymm8_signed(val, info); -} - -template -inline typename std::enable_if::value, uint8_t>::type -quantize(float val, const QuantizationInfo &info) -{ - return quantize_qasymm8(val, info); -} - -template -void sub_same(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat) -{ - /** NEON vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v); - if(is_broadcast_input_2) - { - res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); - } - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; - if(is_broadcast_input_2) - { - res = static_cast(-1) * res; - } - - *(output_ptr + x) = res; - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; - } - }, - input1, input2, output); - } -} - -template -void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat) -{ - ARM_COMPUTE_UNUSED(is_sat); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(static_cast(broadcast_value), wrapper::traits::vector_128_tag{}); - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - } - }; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64_ - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = wrapper::vqmov(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = wrapper::vqmov(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - *(output_ptr + x) = quantize(is_broadcast_input_2 ? afs - bfs : bfs - afs, out->info()->quantization_info()); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = wrapper::vqmov(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = wrapper::vqmov(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - - *(output_ptr + x) = quantize((afs - bfs), out->info()->quantization_info()); - } - }, - input1, input2, output); - } -} - -void sub_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat) -{ - ARM_COMPUTE_UNUSED(is_sat); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), - } - }; - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const float32x4x2_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); - - const float32x4x2_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; - - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), - } - }; - - const int32x4x2_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs - bfs), out->info()->quantization_info()); - } - }, - input1, input2, output); - } -} - -void sub_S16_U8_S16_impl(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat, bool is_swapped) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - if(!is_sat) - { - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin1 = wrapper::vloadq(input1_ptr + x); - const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); - const auto res = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto res = is_swapped ? static_cast(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast(*(input2_ptr + x)); - *(output_ptr + x) = res; - } - } - else - { - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin1 = wrapper::vloadq(input1_ptr + x); - const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); - const auto res = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto res = is_swapped ? wrapper::sub_sat(static_cast(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast(*(input2_ptr + x))); - *(output_ptr + x) = res; - } - } - }, - input1, input2, output); -} - -void sub_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat) -{ - sub_S16_U8_S16_impl(in1, in2, out, window, is_sat, false); -} - -void sub_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat) -{ - // Swap arguments - sub_S16_U8_S16_impl(in2, in1, out, window, is_sat, true); -} - -void sub_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - if(!is_sat) - { - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x))); - const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); - wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast(*(input1_ptr + x)) - static_cast(*(input2_ptr + x)); - } - } - else - { - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x))); - const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); - wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = wrapper::sub_sat(static_cast(*(input1_ptr + x)), - static_cast(*(input2_ptr + x))); - } - } - }, - input1, input2, output); -} - -inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - - const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8) - && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8) - && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED) - && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16) - && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8) - && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16) - && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8) - && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16) - && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32) - && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32) - && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16), - "You called subtract with the wrong image formats"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP) - || (input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP) - || (input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP), - "Convert policy cannot be WRAP if datatype is quantized"); - - // Validate in case of configured output - if(output.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8) - && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8) - && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED) - && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16) - && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16) - && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16) - && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16) - && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16) - && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32 && output.data_type() == DataType::S32) - && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32) - && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16), - "You called subtract with the wrong image formats"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel() - : _func(nullptr), _policy(ConvertPolicy::WRAP) -{ -} - -void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy)); - - _policy = policy; - - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - set_shape_if_empty(*output, out_shape); - - switch(input1->data_type()) - { - case DataType::U8: - if(input2->data_type() == DataType::U8 && output->data_type() == DataType::U8) - { - _func = &sub_same; - } - else if(input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) - { - _func = &sub_U8_U8_S16; - } - else - { - _func = &sub_U8_S16_S16; - } - break; - case DataType::QASYMM8: - _func = &sub_quantized; - set_data_type_if_unknown(*output, DataType::QASYMM8); - break; - case DataType::QASYMM8_SIGNED: - _func = &sub_quantized; - set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED); - break; - case DataType::S16: - if(input2->data_type() == DataType::U8) - { - _func = &sub_S16_U8_S16; - } - else - { - _func = &sub_same; - } - set_format_if_unknown(*output, Format::S16); - break; - case DataType::QSYMM16: - _func = &sub_QSYMM16_QSYMM16_QSYMM16; - set_data_type_if_unknown(*output, DataType::QSYMM16); - break; - case DataType::S32: - _func = &sub_same; - set_format_if_unknown(*output, Format::S32); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &sub_same; - set_format_if_unknown(*output, Format::F16); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - _func = &sub_same; - set_format_if_unknown(*output, Format::F32); - break; - default: - _func = nullptr; - } - - // NEArithmeticSubtractionKernel doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output->set_valid_region(valid_region); - Window win = calculate_max_window(valid_region, Steps()); - - INEKernel::configure(win); -} - -Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy)); - - return Status{}; -} - -void NEArithmeticSubtractionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - // Dispatch kernel - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC_0), - tensors.get_const_tensor(TensorType::ACL_SRC_1), - tensors.get_tensor(TensorType::ACL_DST), - window, - (_policy == ConvertPolicy::SATURATE)); -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.h deleted file mode 100644 index 69952d6162..0000000000 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H -#define ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the kernel to perform subtraction between two tensors */ -class NEArithmeticSubtractionKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEArithmeticSubtractionKernel"; - } - /** Default constructor */ - NEArithmeticSubtractionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEArithmeticSubtractionKernel(const NEArithmeticSubtractionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEArithmeticSubtractionKernel &operator=(const NEArithmeticSubtractionKernel &) = delete; - /** Allow instances of this class to be moved */ - NEArithmeticSubtractionKernel(NEArithmeticSubtractionKernel &&) = default; - /** Allow instances of this class to be moved */ - NEArithmeticSubtractionKernel &operator=(NEArithmeticSubtractionKernel &&) = default; - /** Default destructor */ - ~NEArithmeticSubtractionKernel() = default; - - /** Initialise the kernel's input and output. - * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. - * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. - */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel - * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. - * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - -private: - /** Common signature for all the specialised sub functions - * - * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. - * @param[in] window Region on which to execute the kernel. - * @param[in] is_sat Flag to indicate if the policy is SATURATE. - */ - using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window, bool is_sat); - /** Sub function to use for the particular tensor types passed to configure() */ - SubFunction *_func; - ConvertPolicy _policy; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H */ diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp new file mode 100644 index 0000000000..a03dcf2353 --- /dev/null +++ b/src/core/cpu/kernels/CpuSubKernel.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/cpu/kernels/CpuSubKernel.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" +#include "src/core/cpu/kernels/sub/neon/list.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct SubSelectorData +{ + DataType dt1; + DataType dt2; + DataType dt3; +}; + +using SubSelectorPtr = std::add_pointer::type; +using SubKernelPtr = std::add_pointer::type; + +struct SubKernel +{ + const char *name; + const SubSelectorPtr is_selected; + SubKernelPtr ukernel; +}; + +static const SubKernel available_kernels[] = +{ + { + "sub_same_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "sub_same_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "sub_same_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) + }, + { + "sub_same_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) + }, + { + "sub_same_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) + }, + { + "sub_u8_s16_s16_neon", + [](const SubSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_s16_s16_neon) + }, + { + "sub_s16_u8_s16_neon", + [](const SubSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_s16_u8_s16_neon) + }, + { + "sub_u8_u8_s16_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_u8_s16_neon) + }, + { + "sub_qasymm8_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon) + }, + { + "sub_qasymm8_signed_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon) + }, + { + "sub_qsymm16_neon", + [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon) + }, +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const SubKernel *get_implementation(DataType dt1, DataType dt2, DataType dt3) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected({ dt1, dt2, dt3 })) + { + return &uk; + } + } + return nullptr; +} + +inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +{ + ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + + const auto *uk = get_implementation(src0.data_type(), src1.data_type(), dst.data_type()); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8) + && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8) + && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED) + && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16) + && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8) + && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16) + && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8) + && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16) + && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32) + && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32) + && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16), + "You called subtract with the wrong image formats"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP) + || (src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP) + || (src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP), + "Convert policy cannot be WRAP if datatype is quantized"); + + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8) + && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8) + && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED) + && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16) + && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16) + && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16) + && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16) + && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16) + && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32) + && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32) + && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16), + "You called subtract with the wrong image formats"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + } + return Status{}; +} +} // namespace + +void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); + + const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*src0, *src1); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize dst if not initialized + set_shape_if_empty(*dst, out_shape); + + _policy = policy; + + // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(dst->num_dimensions()); + dst->set_valid_region(valid_region); + Window win = calculate_max_window(valid_region, Steps()); + + ICpuKernel::configure(win); +} + +Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); + + return Status{}; +} + +void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + // Dispatch kernel + const auto *uk = get_implementation(src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type()); + uk->ukernel(src0, src1, dst, _policy, window); +} + +const char *CpuSubKernel::name() const +{ + return "CpuSubKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h new file mode 100644 index 0000000000..da114b6e08 --- /dev/null +++ b/src/core/cpu/kernels/CpuSubKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SUB_KERNEL_H +#define ARM_COMPUTE_CPU_SUB_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform subtraction between two tensors */ +class CpuSubKernel : public ICpuKernel +{ +public: + CpuSubKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel); + + /** Initialise the kernel's src and dst. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] src0 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] src1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] dst The dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. + * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref CpuSubKernel + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] src0 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] src1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + ConvertPolicy _policy{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SUB_KERNEL_H */ diff --git a/src/core/cpu/kernels/sub/neon/integer.cpp b/src/core/cpu/kernels/sub/neon/integer.cpp new file mode 100644 index 0000000000..bba73df1e8 --- /dev/null +++ b/src/core/cpu/kernels/sub/neon/integer.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +void sub_s16_u8_s16_impl(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_swapped) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + if(policy == ConvertPolicy::WRAP) + { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin1 = wrapper::vloadq(input1_ptr + x); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + const auto res = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto res = is_swapped ? static_cast(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast(*(input2_ptr + x)); + *(output_ptr + x) = res; + } + } + else + { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin1 = wrapper::vloadq(input1_ptr + x); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + const auto res = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto res = is_swapped ? wrapper::sub_sat(static_cast(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast(*(input2_ptr + x))); + *(output_ptr + x) = res; + } + } + }, + input1, input2, output); +} +} + +void sub_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + sub_s16_u8_s16_impl(src1, src0, dst, policy, window, false); +} + +void sub_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + // Swap arguments + sub_s16_u8_s16_impl(src1, src0, dst, policy, window, true); +} + +void sub_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + if(policy == ConvertPolicy::WRAP) + { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x))); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast(*(input1_ptr + x)) - static_cast(*(input2_ptr + x)); + } + } + else + { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x))); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(output_ptr + x) = wrapper::sub_sat(static_cast(*(input1_ptr + x)), + static_cast(*(input2_ptr + x))); + } + } + }, + input1, input2, output); +} + +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/sub/neon/list.h b/src/core/cpu/kernels/sub/neon/list.h new file mode 100644 index 0000000000..d5685824fc --- /dev/null +++ b/src/core/cpu/kernels/sub/neon/list.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_SUB_LIST_H +#define SRC_CORE_NEON_KERNELS_SUB_LIST_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SUB_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) + +DECLARE_SUB_KERNEL(sub_qasymm8_neon); +DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon); +DECLARE_SUB_KERNEL(sub_qsymm16_neon); +DECLARE_SUB_KERNEL(sub_s16_u8_s16_neon); +DECLARE_SUB_KERNEL(sub_u8_s16_s16_neon); +DECLARE_SUB_KERNEL(sub_u8_u8_s16_neon); + +#undef DECLARE_SUB_KERNEL + +template +void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + bool is_sat = policy == ConvertPolicy::SATURATE; + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); + Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); + Iterator output(dst, window); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v); + if(is_broadcast_input_2) + { + res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); + } + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; + if(is_broadcast_input_2) + { + res = static_cast(-1) * res; + } + + *(output_ptr + x) = res; + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_NEON_KERNELS_SUB_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/sub/neon/qasymm8.cpp b/src/core/cpu/kernels/sub/neon/qasymm8.cpp new file mode 100644 index 0000000000..8f4cd8bdbb --- /dev/null +++ b/src/core/cpu/kernels/sub/neon/qasymm8.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); + const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); + const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); + const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(static_cast(broadcast_value), wrapper::traits::vector_128_tag{}); + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), + } + }; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64_ + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; + const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qasymm8(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info()); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); + const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + } + }; + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; + const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; + + *(output_ptr + x) = quantize_qasymm8((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} + +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..2c9e411743 --- /dev/null +++ b/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); + const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); + const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); + const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(static_cast(broadcast_value), wrapper::traits::vector_128_tag{}); + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), + } + }; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64_ + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; + const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qasymm8_signed(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info()); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); + const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + } + }; + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; + const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; + + *(output_ptr + x) = quantize_qasymm8_signed((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/sub/neon/qsymm16.cpp b/src/core/cpu/kernels/sub/neon/qsymm16.cpp new file mode 100644 index 0000000000..4dfdc0e78c --- /dev/null +++ b/src/core/cpu/kernels/sub/neon/qsymm16.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + + const float32x4x2_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), + } + }; + const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const float32x4x2_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const float32x4x2_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), + } + }; + + const float32x4x2_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), + } + }; + + const int32x4x2_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 512cfd6f70..0263d4cbb6 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,35 +24,18 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" -#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h" +#include "src/runtime/cpu/operators/CpuSub.h" #include namespace arm_compute { -namespace experimental -{ -void NEArithmeticSubtraction::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique(); - k->configure(input1, input2, output, policy); - _kernel = std::move(k); -} - -Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy); -} -} // namespace experimental - struct NEArithmeticSubtraction::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr op{ nullptr }; }; NEArithmeticSubtraction::NEArithmeticSubtraction() @@ -65,7 +48,7 @@ NEArithmeticSubtraction::~NEArithmeticSubtraction() Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { - return experimental::NEArithmeticSubtraction::validate(input1, input2, output, policy, act_info); + return cpu::CpuSub::validate(input1, input2, output, policy, act_info); } void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -73,7 +56,7 @@ void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *in _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique(); + _impl->op = std::make_unique(); _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info); } diff --git a/src/runtime/cpu/operators/CpuSub.cpp b/src/runtime/cpu/operators/CpuSub.cpp new file mode 100644 index 0000000000..9baaaa9d67 --- /dev/null +++ b/src/runtime/cpu/operators/CpuSub.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuSub.h" + +#include "src/core/cpu/kernels/CpuSubKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + auto k = std::make_unique(); + k->configure(src0, src1, dst, policy); + _kernel = std::move(k); +} + +Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuSubKernel::validate(src0, src1, dst, policy); +} +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuSub.h b/src/runtime/cpu/operators/CpuSub.h new file mode 100644 index 0000000000..099ffef87e --- /dev/null +++ b/src/runtime/cpu/operators/CpuSub.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SUB_H +#define ARM_COMPUTE_CPU_SUB_H + +#include "src/runtime/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuSubKernel */ +class CpuSub : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, dst and conversion policy. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] dst Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CpuSub + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] src0 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 + * @param[in] src1 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 + * @param[in] dst Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 + * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SUB_H */ \ No newline at end of file -- cgit v1.2.1