From d5c496d87e3b446532dd3dd163e9768de0daff4e Mon Sep 17 00:00:00 2001 From: Dana Zlotnik Date: Sun, 28 Nov 2021 14:46:12 +0200 Subject: Decouple CpuElementwiseKernel 1- reorganize the folder structure according to the new definition 2- separate the unary and binary implementations 3- decouple kernels - unary, binary op and binary comparison Resolves COMPMID-4634 Change-Id: I0195846cc372e74a63c659069a4508de53a22110 Signed-off-by: Dana Zlotnik Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6860 Tested-by: Arm Jenkins Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins --- src/cpu/kernels/CpuElementwiseKernel.cpp | 96 +- src/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 15 +- .../kernels/elementwise/neon/elementwise_list.h | 486 --- .../elementwise/neon/elementwise_quantized_list.h | 654 ------------ .../elementwise/neon/elementwise_unary_list.h | 116 -- src/cpu/kernels/elementwise/sve/elementwise.cpp | 311 ------ src/cpu/kernels/elementwise/sve/elementwise_list.h | 171 --- .../elementwise/sve/elementwise_quantized_list.h | 366 ------- .../kernels/elementwise/sve/elementwise_unary.cpp | 113 -- .../elementwise/sve/elementwise_unary_list.h | 39 - .../elementwise_binary/generic/neon/fp16.cpp | 61 ++ .../elementwise_binary/generic/neon/fp32.cpp | 58 + .../kernels/elementwise_binary/generic/neon/impl.h | 1103 ++++++++++++++++++++ .../elementwise_binary/generic/neon/integer.cpp | 95 ++ .../elementwise_binary/generic/neon/qasymm8.cpp | 59 ++ .../generic/neon/qasymm8_signed.cpp | 60 ++ .../elementwise_binary/generic/sve/fp16.cpp | 61 ++ .../elementwise_binary/generic/sve/fp32.cpp | 60 ++ .../elementwise_binary/generic/sve/impl.cpp | 315 ++++++ .../kernels/elementwise_binary/generic/sve/impl.h | 165 +++ .../elementwise_binary/generic/sve/integer.cpp | 97 ++ .../kernels/elementwise_binary/generic/sve2/impl.h | 363 +++++++ .../elementwise_binary/generic/sve2/qasymm8.cpp | 61 ++ .../generic/sve2/qasymm8_signed.cpp | 61 ++ src/cpu/kernels/elementwise_binary/list.h | 72 ++ .../elementwise_unary/generic/neon/fp16.cpp | 38 + .../elementwise_unary/generic/neon/fp32.cpp | 36 + .../kernels/elementwise_unary/generic/neon/impl.h | 116 ++ .../elementwise_unary/generic/neon/integer.cpp | 36 + .../kernels/elementwise_unary/generic/sve/fp16.cpp | 38 + .../kernels/elementwise_unary/generic/sve/fp32.cpp | 38 + .../kernels/elementwise_unary/generic/sve/impl.cpp | 109 ++ .../kernels/elementwise_unary/generic/sve/impl.h | 36 + .../elementwise_unary/generic/sve/integer.cpp | 38 + src/cpu/kernels/elementwise_unary/list.h | 48 + 35 files changed, 3276 insertions(+), 2315 deletions(-) delete mode 100644 src/cpu/kernels/elementwise/neon/elementwise_list.h delete mode 100644 src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h delete mode 100644 src/cpu/kernels/elementwise/neon/elementwise_unary_list.h delete mode 100644 src/cpu/kernels/elementwise/sve/elementwise.cpp delete mode 100644 src/cpu/kernels/elementwise/sve/elementwise_list.h delete mode 100644 src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h delete mode 100644 src/cpu/kernels/elementwise/sve/elementwise_unary.cpp delete mode 100644 src/cpu/kernels/elementwise/sve/elementwise_unary_list.h create mode 100644 src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/neon/impl.h create mode 100644 src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp create mode 100644 
src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/sve/impl.h create mode 100644 src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/sve2/impl.h create mode 100644 src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp create mode 100644 src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp create mode 100644 src/cpu/kernels/elementwise_binary/list.h create mode 100644 src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp create mode 100644 src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp create mode 100644 src/cpu/kernels/elementwise_unary/generic/neon/impl.h create mode 100644 src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp create mode 100644 src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp create mode 100644 src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp create mode 100644 src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp create mode 100644 src/cpu/kernels/elementwise_unary/generic/sve/impl.h create mode 100644 src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp create mode 100644 src/cpu/kernels/elementwise_unary/list.h (limited to 'src/cpu') diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp index 91de24b850..53179ae95f 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,10 +28,7 @@ #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/cpu/kernels/elementwise/neon/elementwise_list.h" -#include "src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h" +#include "src/cpu/kernels/elementwise_binary/list.h" #include @@ -68,76 +65,73 @@ CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0, { "sve_fp32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op)) + REGISTER_FP32_SVE((arm_compute::cpu::sve_fp32_elementwise_binary)) }, { "sve_s32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op)) + REGISTER_INTEGER_SVE((arm_compute::cpu::sve_s32_elementwise_binary)) }, { "sve_s16_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op)) + REGISTER_INTEGER_SVE((arm_compute::cpu::sve_s16_elementwise_binary)) + }, + { + "sve_fp16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE((arm_compute::cpu::sve_fp16_elementwise_binary)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { "neon_fp32_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op>)) + REGISTER_FP32_NEON((arm_compute::cpu::neon_fp32_elementwise_binary)) }, { "neon_s32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>)) + REGISTER_INTEGER_NEON((arm_compute::cpu::neon_s32_elementwise_binary)) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, + REGISTER_FP16_NEON((arm_compute::cpu::neon_fp16_elementwise_binary)) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_s16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON((arm_compute::cpu::neon_s16_elementwise_binary)) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #if defined(ARM_COMPUTE_ENABLE_SVE2) { "sve2_qu8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op)) + REGISTER_QASYMM8_SVE2((arm_compute::cpu::sve2_qasymm8_elementwise_binary)) }, { "sve2_qs8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op)) + REGISTER_QASYMM8_SIGNED_SVE2((arm_compute::cpu::sve2_qasymm8_signed_elementwise_binary)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) { 
"neon_qu8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized)) + REGISTER_QASYMM8_NEON((arm_compute::cpu::neon_qasymm8_elementwise_binary)) }, { "neon_qs8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed)) + REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::neon_qasymm8_signed_elementwise_binary)) }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_s16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ }; for(const auto &uk : kernels) @@ -161,82 +155,82 @@ CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, c { "sve_u8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) + REGISTER_INTEGER_SVE(arm_compute::cpu::sve_u8_comparison_elementwise_binary) }, { "sve_fp32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op)) + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_comparison_elementwise_binary) }, { "sve_s16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) + REGISTER_INTEGER_SVE(arm_compute::cpu::sve_s16_comparison_elementwise_binary) }, { "sve_s32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) + REGISTER_INTEGER_SVE(arm_compute::cpu::sve_s32_comparison_elementwise_binary) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { "neon_u8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8)) + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_comparison_elementwise_binary) }, { "neon_fp32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32)) + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_comparison_elementwise_binary) }, { "neon_s16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, - 
REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16)) + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_comparison_elementwise_binary) }, { "neon_s32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32)) + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_comparison_elementwise_binary) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #if defined(ARM_COMPUTE_ENABLE_SVE2) { "sve2_qu8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op)) + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_comparison_elementwise_binary) }, { "sve2_qs8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op)) + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_comparison_elementwise_binary) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) { "neon_qu8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized)) + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_comparison_elementwise_binary) }, { "neon_qs8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed)) + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_comparison_elementwise_binary) }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ +#endif /* defined(ARM_COMPUTE_ENABLE_NEON ||ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_SVE) { "sve_fp16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op)) + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_comparison_elementwise_binary) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "neon_fp16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16)) + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_comparison_elementwise_binary) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ }; diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp index 61bc64b235..f3a82c23f0 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -31,8 +31,7 @@ #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/cpu/kernels/elementwise/neon/elementwise_unary_list.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_unary_list.h" +#include "src/cpu/kernels/elementwise_unary/list.h" #include "support/ToolchainSupport.h" namespace arm_compute @@ -52,7 +51,7 @@ static const std::vector avai { return data.dt == DataType::F32 && data.isa.sve; }, - 
REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op) + REGISTER_FP32_SVE(sve_fp32_elementwise_unary) }, { "sve_fp16_elementwise_unary", @@ -60,31 +59,31 @@ static const std::vector avai { return (data.dt == DataType::F16) && data.isa.sve; }, - REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>), + REGISTER_FP16_SVE(sve_fp16_elementwise_unary), }, { "sve_s32_elementwise_unary", [](const DataTypeISASelectorData & data) { return data.dt == DataType::S32 && data.isa.sve; }, - REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op), + REGISTER_INTEGER_SVE(sve_s32_elementwise_unary), }, #endif // defined(ARM_COMPUTE_ENABLE_SVE) #if defined(ARM_COMPUTE_ENABLE_NEON) { "neon_fp32_elementwise_unary", [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op), + REGISTER_FP32_NEON(neon_fp32_elementwise_unary), }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "neon_fp16_elementwise_unary", [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<__fp16>), + REGISTER_FP32_NEON(neon_fp16_elementwise_unary), }, #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "neon_s32_elementwise_unary", [](const DataTypeISASelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op), + REGISTER_INTEGER_NEON(neon_s32_elementwise_unary), }, #endif // defined(ARM_COMPUTE_ENABLE_NEON) }; diff --git a/src/cpu/kernels/elementwise/neon/elementwise_list.h b/src/cpu/kernels/elementwise/neon/elementwise_list.h deleted file mode 100644 index 43e44be5e2..0000000000 --- a/src/cpu/kernels/elementwise/neon/elementwise_list.h +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H - -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = std::min(16 / static_cast(sizeof(OutputScalarType)), 8); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? 
a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); - for(; x < window_end_x; ++x) - { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); - } -} - -template -inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b) -{ - auto res = ScalarType(0); - - switch(op) - { - case ArithmeticOperation::MAX: - res = std::max(a, b); - break; - case ArithmeticOperation::MIN: - res = std::min(a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - res = (a - b) * (a - b); - break; - } - case ArithmeticOperation::PRELU: - { - res = (a > 0 ? a : a * b); - break; - } - case ArithmeticOperation::DIV: - { - res = a / b; - if(std::is_integral::value) - { - res = (b == 0) ? 0 : res; - if(static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) - { - --res; - } - } - break; - } - case ArithmeticOperation::POWER: - { - res = std::pow(a, b); - break; - } - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template -inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) -{ - using vec_type = typename VectorType::type; - using scalar_type = typename VectorType::scalar_type; - using tag_type = typename VectorType::tag_type; - - vec_type res = wrapper::vdup_n(static_cast(0), tag_type{}); - - switch(op) - { - case ArithmeticOperation::MAX: - res = wrapper::vmax(a, b); - break; - case ArithmeticOperation::MIN: - res = wrapper::vmin(a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - const vec_type tmp = wrapper::vsub(a, b); - res = wrapper::vmul(tmp, tmp); - break; - } - case ArithmeticOperation::PRELU: - { - const vec_type zero = wrapper::vdup_n(static_cast(0), tag_type{}); - const vec_type tmp = wrapper::vmul(a, b); - const auto gt = wrapper::vcgt(a, zero); - - res = wrapper::vbsl(gt, a, tmp); - break; - } - - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template <> -inline int32x4_t elementwise_arithm_op>(const int32x4_t &a, const int32x4_t &b) -{ - return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); -} - -template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) -{ - return wrapper::vdiv(a, b); -} - -template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) -{ - return wrapper::vpow(a, b); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) -{ - return wrapper::vdiv(a, b); -} - -template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) -{ - return wrapper::vpow(a, b); -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template -inline 
typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) -{ - using tag_type = typename VectorType::tag_type; - using vec_type = typename VectorType::type; - - vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{}); - return elementwise_arithm_op(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); -} - -template -inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, elementwise_arithm_op(a, b)); - } - return x; -} - -template -inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast(a, broadcast_value, reorder)); - } - return x; -} - -template -void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using scalar_type = typename VectorType::scalar_type; - - elementwise_op(in1, in2, out, window, - &elementwise_arithm_op_scalar, - &elementwise_arithm_op_broadcast_loop, - &elementwise_arithm_op_loop); -} - -template -inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b) -{ - bool res = false; - - switch(op) - { - case ComparisonOperation::Equal: - res = (a == b); - break; - case ComparisonOperation::NotEqual: - res = (a != b); - break; - case ComparisonOperation::Greater: - res = (a > b); - break; - case ComparisonOperation::GreaterEqual: - res = (a >= b); - break; - case ComparisonOperation::Less: - res = (a < b); - break; - case ComparisonOperation::LessEqual: - res = (a <= b); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res ? ~static_cast(0) : static_cast(0); -} - -template -inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) -{ - OutputVectorType res = { 0, 0, 0, 0 }; - - switch(op) - { - case ComparisonOperation::Equal: - res = wrapper::vceq(a, b); - break; - case ComparisonOperation::NotEqual: - res = wrapper::vnot(wrapper::vceq(a, b)); - break; - case ComparisonOperation::Greater: - res = wrapper::vcgt(a, b); - break; - case ComparisonOperation::GreaterEqual: - res = wrapper::vcge(a, b); - break; - case ComparisonOperation::Less: - res = wrapper::vcgt(b, a); - break; - case ComparisonOperation::LessEqual: - res = wrapper::vcge(b, a); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template -inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) -{ - InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_comp_op(reorder ? broadcast_vector : a, reorder ? 
a : broadcast_vector); -} - -template -inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, a); - } - return x; -} - -template -inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); - } - return x; -} - -template -inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); - const auto b = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); - } - if(x <= window_end_x - 4) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - for(int i = 0; i < 4; i++) - { - *(output_ptr + x + i) = wrapper::vgetlane(a, i); - } - x = +4; - } - return x; -} - -template -inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, res); - } - return x; -} - -template -inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, wrapper::vmovn(res)); - } - return x; -} - -template -inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto a = wrapper::vloadq(input1_ptr + x); - auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - a = 
wrapper::vloadq(input1_ptr + x + 4); - b = wrapper::vloadq(input2_ptr + x + 4); - const auto res2 = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); - } - if(x <= window_end_x - 4) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - for(int i = 0; i < 4; i++) - { - *(output_ptr + x + i) = wrapper::vgetlane(res, i); - } - x = +4; - } - return x; -} - -template -void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_8_loop, - &elementwise_comp_op_8_loop); -} - -template -void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_16_loop, - &elementwise_comp_op_16_loop); -} - -template -void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_32_loop, - &elementwise_comp_op_32_loop); -} -} // namesapce cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */ \ No newline at end of file diff --git a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h deleted file mode 100644 index 3b4c112770..0000000000 --- a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h +++ /dev/null @@ -1,654 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H - -#include "src/cpu/kernels/elementwise/neon/elementwise_list.h" - -namespace arm_compute -{ -namespace cpu -{ -float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) -{ - qasymm8x16_t x = vld1q_u8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - } - }; - return out; -} - -float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) -{ - qasymm8x16_signed_t x = vld1q_s8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - } - }; - return out; -} - -void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) -{ - const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1]))); - const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3]))); - vst1q_u8(output_ptr, vcombine_u8(pa, pb)); -} - -void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) -{ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); - vst1q_u8(output_ptr, vcombine_u8(pa, pb)); -} - -void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) -{ - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; - store_quantized(output_ptr, out); -} - -void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) -{ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); - vst1q_s8(output_ptr, vcombine_s8(pa, pb)); -} - -void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) -{ - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; - store_quantized_signed(output_ptr, out); -} - -template -inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, 
UniformQuantizationInfo qinfo) -{ - return quantize_qasymm8(elementwise_arithm_op_scalar(a, b), qinfo); -} - -template -inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) -{ - return quantize_qasymm8_signed(elementwise_arithm_op_scalar(a, b), qinfo); -} - -template -inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) -{ - using neon_vector_float = wrapper::traits::neon_vector; - float32x4x4_t out = - { - { - elementwise_arithm_op(a.val[0], b.val[0]), - elementwise_arithm_op(a.val[1], b.val[1]), - elementwise_arithm_op(a.val[2], b.val[2]), - elementwise_arithm_op(a.val[3], b.val[3]), - } - }; - return out; -} - -template -inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) -{ - ARM_COMPUTE_UNUSED(qinfo); - return elementwise_comp_op_scalar(a, b); -} - -template -inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) -{ - uint32x4x4_t out = - { - { - elementwise_comp_op(a.val[0], b.val[0]), - elementwise_comp_op(a.val[1], b.val[1]), - elementwise_comp_op(a.val[2], b.val[2]), - elementwise_comp_op(a.val[3], b.val[3]) - } - }; - return out; -} - -template -inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_arithm_op(af, bf); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_arithm_op(af, bf); - store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} -template -inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); - store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); - const uint32x4x4_t rf = elementwise_comp_op(af, bf); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); - const uint32x4x4_t rf = elementwise_comp_op(af, bf); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); - store_quantized(output_ptr + x, rf); - } - return x; -} - -void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero) - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, 
const int8_t *, const int8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -template -void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar, - &elementwise_arithm_op_quantized_broadcast_loop, - &elementwise_arithm_op_quantized_loop); -} -template -void 
elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar, - &elementwise_arithm_op_quantized_signed_broadcast_loop, - &elementwise_arithm_op_quantized_singed_loop); -} - -template -void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_broadcast_loop, - &elementwise_comp_op_quantized_loop); -} - -template -void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_signed_broadcast_loop, - &elementwise_comp_op_quantized_signed_loop); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ diff --git a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h deleted file mode 100644 index 307e95fae9..0000000000 --- a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
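For reference, the scalar tail in the quantized signed path above reduces to dequantize, operate in float, requantize. A minimal sketch of that per-element round trip, assuming the library's dequantize_qasymm8_signed()/quantize_qasymm8_signed() helpers from arm_compute/core/QuantizationInfo.h and using MAX as the example operation (the helper name below is hypothetical):

    #include "arm_compute/core/QuantizationInfo.h"
    #include <algorithm>

    // Scalar QASYMM8_SIGNED MAX: dequantize both operands, take the float max,
    // then requantize with the output quantization info (round, add offset, clamp).
    inline int8_t scalar_max_qasymm8_signed(int8_t a, int8_t b,
                                            const arm_compute::UniformQuantizationInfo &qa,
                                            const arm_compute::UniformQuantizationInfo &qb,
                                            const arm_compute::UniformQuantizationInfo &qo)
    {
        const float afs = arm_compute::dequantize_qasymm8_signed(a, qa); // (a - offset) * scale
        const float bfs = arm_compute::dequantize_qasymm8_signed(b, qb);
        return arm_compute::quantize_qasymm8_signed(std::max(afs, bfs), qo);
    }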
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" - -namespace arm_compute -{ -namespace cpu -{ -template -inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return 1 / sqrt(a); - case ElementWiseUnary::EXP: - return std::exp(a); - case ElementWiseUnary::NEG: - return -a; - case ElementWiseUnary::LOG: - return std::log(a); - case ElementWiseUnary::ABS: - return std::abs(a); - case ElementWiseUnary::ROUND: - return support::cpp11::nearbyint(a); - case ElementWiseUnary::SIN: - return std::sin(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return wrapper::vinvsqrt(a); - case ElementWiseUnary::EXP: - return wrapper::vexpq(a); - case ElementWiseUnary::NEG: - return wrapper::vneg(a); - case ElementWiseUnary::LOG: - return wrapper::vlog(a); - case ElementWiseUnary::ABS: - return wrapper::vabs(a); - case ElementWiseUnary::ROUND: - return wrapper::vround(a); - case ElementWiseUnary::SIN: - return wrapper::vsin(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(output_ptr + x, elementwise_op_imp(op, wrapper::vloadq(input_ptr + x))); - } - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); - } - }, - input, output); -} - -} // namespace cpu -} // namespace arm_compute - -#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/cpu/kernels/elementwise/sve/elementwise.cpp b/src/cpu/kernels/elementwise/sve/elementwise.cpp deleted file mode 100644 index 2f9a7998df..0000000000 --- a/src/cpu/kernels/elementwise/sve/elementwise.cpp +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
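The unary header removed above follows the standard NEON shape: a main loop over full 16-byte registers (16 / sizeof(ScalarType) lanes per iteration) followed by a scalar tail for the remainder. A stripped-down sketch of that loop shape, written with raw NEON intrinsics for F32 ABS instead of the wrapper layer (the function name is hypothetical):

    #include <arm_neon.h>
    #include <cmath>

    void unary_abs_f32(const float *in, float *out, int len)
    {
        constexpr int step = 16 / static_cast<int>(sizeof(float)); // 4 lanes per 128-bit vector
        int x = 0;
        for(; x <= len - step; x += step)
        {
            vst1q_f32(out + x, vabsq_f32(vld1q_f32(in + x))); // vectorised body
        }
        for(; x < len; ++x)
        {
            out[x] = std::fabs(in[x]); // scalar tail, mirrors elementwise_op_scalar_imp()
        }
    }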
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -struct LoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; -}; - -template -struct BroadcastLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - InputScalarType broadcast_value; - OutputScalarType *output_ptr; - bool reorder; -}; - -template -void arithmetic_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -void comparison_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -using LoopFuncType = void (*)(svbool_t, const LoopArguments &); - -template -using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments &); - -template ::type, - typename OutputScalarType = typename sve_scalar::type> -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopFuncType func, - BroadcastLoopFuncType broadcast_func) -{ - const auto all_true_pg = svptrue(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - broadcast_func(pg, - { - op, - non_broadcast_input_ptr + x, - broadcast_value, - output_ptr + x, - !is_broadcast_input_2 - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - func(pg, - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - -template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window) -{ - using VectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &arithmetic_op_loop, - &arithmetic_op_broadcast_loop); -} - -template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector::type; - using OutputVectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &comparison_op_loop, - &comparison_op_broadcast_loop); -} - -template <> -svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - -template <> -svint32_t elementwise_div(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - -template <> -svint16_t elementwise_div(svbool_t &pg, const svint16_t &a, const svint16_t &b) -{ - ARM_COMPUTE_UNUSED(pg, a, b); - ARM_COMPUTE_ERROR("Not supported"); -} - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/cpu/kernels/elementwise/sve/elementwise_list.h b/src/cpu/kernels/elementwise/sve/elementwise_list.h deleted file mode 100644 index f762587ce7..0000000000 --- a/src/cpu/kernels/elementwise/sve/elementwise_list.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
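Unlike the NEON kernels, the SVE implementation removed above has no scalar tail: every iteration is driven by a predicate produced with whilelt, and the loop exits once no lanes remain active. A self-contained sketch of that control flow, reduced to a float32 MAX with none of the window or broadcast plumbing (the function name is hypothetical):

    #include <arm_sve.h>

    void sve_max_f32(const float *a, const float *b, float *out, int len)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, len); // active lanes for [x, len)
        do
        {
            const svfloat32_t va = svld1_f32(pg, a + x);
            const svfloat32_t vb = svld1_f32(pg, b + x);
            svst1_f32(pg, out + x, svmax_f32_z(pg, va, vb)); // predicated store
            x += static_cast<int>(svcntw());                 // advance by the vector length in words
            pg = svwhilelt_b32(x, len);
        }
        while(svptest_any(svptrue_b32(), pg)); // loop while any lane is still active
    }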
- */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/NEON/wrapper/svtraits.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) -{ - return svpow_z(pg, a, b); -} - -template -VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) -{ - return svdiv_z(pg, a, b); -} - -template -svbool_t narrow_to_byte_predicate(svbool_t pg) -{ - const auto all_false = svpfalse(); - - switch(bytewidth) - { - case 8: - pg = svuzp1_b32(pg, all_false); - /* fall through */ - case 4: - pg = svuzp1_b16(pg, all_false); - /* fall through */ - case 2: - pg = svuzp1_b8(pg, all_false); - /* fall through */ - default: - break; - } - return pg; -} - -template -VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) -{ - using ScalarType = typename wrapper::sve_scalar::type; - VectorType res{}; - - switch(op) - { - case ArithmeticOperation::MAX: - res = svmax_z(pg, a, b); - break; - case ArithmeticOperation::MIN: - res = svmin_z(pg, a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - const auto tmp = svsub_z(pg, a, b); - res = svmul_z(pg, tmp, tmp); - break; - } - case ArithmeticOperation::PRELU: - { - const auto zero = svdup_n(ScalarType(0)); - const auto tmp = svmul_z(pg, a, b); - const auto gt = svcmpgt(pg, a, zero); - res = svsel(gt, a, tmp); - break; - } - case ArithmeticOperation::DIV: - { - res = elementwise_div(pg, a, b); - break; - } - case ArithmeticOperation::POWER: - { - res = elementwise_pow(pg, a, b); - break; - } - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template -OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) -{ - svbool_t selection_vector{}; - - switch(op) - { - case ComparisonOperation::Equal: - selection_vector = svcmpeq(pg, a, b); - break; - case ComparisonOperation::NotEqual: - selection_vector = svcmpne(pg, a, b); - break; - case ComparisonOperation::Greater: - selection_vector = svcmpgt(pg, a, b); - break; - case ComparisonOperation::GreaterEqual: - selection_vector = svcmpge(pg, a, b); - break; - case ComparisonOperation::Less: - selection_vector = svcmplt(pg, a, b); - break; - case ComparisonOperation::LessEqual: - selection_vector = svcmple(pg, a, b); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - using InputScalarType = typename wrapper::sve_scalar::type; - selection_vector = narrow_to_byte_predicate(selection_vector); - - using OutputScalarType = typename wrapper::sve_scalar::type; - const auto false_vector = svdup_n(static_cast((uint32_t)0)); - const auto true_vector = svdup_n(static_cast(~(uint32_t)0)); - auto ret = svsel(selection_vector, true_vector, false_vector); - - return ret; -} - -template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template -void elementwise_comparison_op(const ITensor 
*in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h deleted file mode 100644 index a5d17a86a7..0000000000 --- a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H - -#if defined(ARM_COMPUTE_ENABLE_SVE2) - -#include "src/core/NEON/wrapper/svtraits.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -struct QuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; - - const svint32_t &in1_offset; - const svint32_t &in2_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &in2_scale; - const svfloat32_t &out_scale; -}; - -template -struct BroadcastQuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - float broadcast_value; - OutputScalarType *output_ptr; - bool reorder; - - const svint32_t &in1_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &out_scale; -}; - -svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) -{ - auto x = svld1(pg, ptr); - - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); - - pg = svptrue_b8(); - - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); -} - -svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) -{ - auto x = svld1(pg, ptr); - - //vprint(x); - - const auto widened = svcreate4( - 
svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); - - pg = svptrue_b8(); - - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); -} - -void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) -{ - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); - - const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); - const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); - const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); - svst1(pg, ptr, narrowed); -} - -void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) -{ - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); - - const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); - const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); - const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); - - svst1(pg, ptr, narrowed); -} - -template -inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); - - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} - -template -inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); - - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? 
in1 : in2; - - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_arithmetic_op(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_arithmetic_op(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_arithmetic_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); - - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} - -template -inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - - using OutputVectorType = typename wrapper::traits::sve_vector::type; - - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} - -template -inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); - - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? 
in1 : in2; - - using OutputVectorType = typename wrapper::traits::sve_vector::type; - - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_comparison_op(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_comparison_op(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_comparison_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} - -template -using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments &); - -template -using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments &); - -template ::type, - typename OutputScalarType = typename wrapper::sve_scalar::type> -void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopQuantizedFuncType func, - BroadcastQuantizedLoopFuncType broadcast_func) -{ - const auto all_true_pg = wrapper::svptrue(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); - const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? 
in2->info()->quantization_info() : in1->info()->quantization_info(); - - const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); - const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto args = BroadcastQuantizedLoopArguments - { - op, - non_broadcast_input_ptr + x, - Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo), - output_ptr + x, - !is_broadcast_input_2, - non_broadcast_voffset, output_voffset, - non_broadcast_vscale, output_vscale - }; - broadcast_func(pg, args); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); - const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); - - const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); - const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto args = QuantizedLoopArguments - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x, - in1_voffset, in2_voffset, output_voffset, - in1_vscale, in2_vscale, output_vscale - }; - func(pg, args); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - -template -void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using VectorType = typename wrapper::traits::sve_vector::type; - elementwise_quantized_op(in1, in2, out, window, op, - &arithmetic_op_quantized_loop, - &arithmetic_op_broadcast_quantized_loop); -} - -template -void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename wrapper::traits::sve_vector::type; - using OutputVectorType = typename wrapper::traits::sve_vector::type; - elementwise_quantized_op(in1, in2, out, window, op, - 
&comparison_op_quantized_loop, - &comparison_op_broadcast_quantized_loop); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ \ No newline at end of file diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp deleted file mode 100644 index ddf1febd66..0000000000 --- a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return svinvsqrt(pg, a); - case ElementWiseUnary::EXP: - return wrapper::svexp_z(pg, a); - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::LOG: - return wrapper::svlog_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - case ElementWiseUnary::ROUND: - return svrintn_z(pg, a); - case ElementWiseUnary::SIN: - return wrapper::svsin_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - 
auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); -} - -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h deleted file mode 100644 index c2b495f27c..0000000000 --- a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H - -#include "arm_compute/core/Types.h" -#if defined(ARM_COMPUTE_ENABLE_SVE) - -namespace arm_compute -{ -namespace cpu -{ -template -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp new file mode 100644 index 0000000000..6091ef215e --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. 
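The load_quantized()/store_quantized() helpers in the SVE2 quantized list deleted further above widen each 8-bit lane to 32 bits, subtract the offset and scale to float on the way in, and scale, round, offset and saturating-narrow on the way out. The same per-lane arithmetic written as plain scalar code (helper names are hypothetical; rounding is to nearest with ties away from zero, matching svrinta):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline float dequant_u8(uint8_t q, int32_t offset, float scale)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }

    inline uint8_t requant_u8(float f, int32_t offset, float inv_scale) // inv_scale = 1 / scale
    {
        const int32_t q = static_cast<int32_t>(std::lround(f * inv_scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q))); // saturate, like the svqxtn*/svqxtun* narrowing
    }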
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template +void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op>(in1, in2, out, window); +} + +template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_16(in1, in2, out, window); +} + +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} +} // namespace arm_compute +#endif 
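With the binary kernels split per data type, each new translation unit exposes a plain templated entry point that higher levels can select by data type and operation. A hedged usage sketch for the FP32 variant defined in the file that follows, assuming its declaration (from the new elementwise_binary/list.h) is visible and using ArithmeticOperation::MAX; the calling function itself is hypothetical:

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Window.h"

    void run_max_fp32(const arm_compute::ITensor *a, const arm_compute::ITensor *b,
                      arm_compute::ITensor *dst, const arm_compute::Window &win)
    {
        // Same signature as the NEON/SVE entry points introduced by this patch.
        arm_compute::cpu::neon_fp32_elementwise_binary<arm_compute::ArithmeticOperation::MAX>(a, b, dst, win);
    }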
//defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp new file mode 100644 index 0000000000..2d8fec91c5 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template +void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op>(in1, in2, out, window); +} + +template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_32(in1, in2, out, window); +} +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window 
&window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h new file mode 100644 index 0000000000..ead54ab14e --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h @@ -0,0 +1,1103 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H +#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H + +#include "src/core/NEON/NEAsymm.h" + +namespace arm_compute +{ +namespace cpu +{ +template +typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) +{ + using vec_type = typename VectorType::type; + using scalar_type = typename VectorType::scalar_type; + using tag_type = typename VectorType::tag_type; + + vec_type res = wrapper::vdup_n(static_cast(0), tag_type{}); + + switch(op) + { + case ArithmeticOperation::MAX: + res = wrapper::vmax(a, b); + break; + case ArithmeticOperation::MIN: + res = wrapper::vmin(a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + const vec_type tmp = wrapper::vsub(a, b); + res = wrapper::vmul(tmp, tmp); + break; + } + case ArithmeticOperation::PRELU: + { + const vec_type zero = wrapper::vdup_n(static_cast(0), tag_type{}); + const vec_type tmp = wrapper::vmul(a, b); + const auto gt = wrapper::vcgt(a, zero); + + res = wrapper::vbsl(gt, a, tmp); + break; + } + + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} +template +typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) +{ + using tag_type = typename VectorType::tag_type; + using vec_type = typename VectorType::type; + + vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{}); + return elementwise_arithm_op(reorder ? broadcast_vector : a, reorder ? 
a : broadcast_vector); +} + +template +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? 
a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); + for(; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +template +inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch(op) + { + case ArithmeticOperation::MAX: + res = std::max(a, b); + break; + case ArithmeticOperation::MIN: + res = std::min(a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + res = (a - b) * (a - b); + break; + } + case ArithmeticOperation::PRELU: + { + res = (a > 0 ? a : a * b); + break; + } + case ArithmeticOperation::DIV: + { + res = a / b; + if(std::is_integral::value) + { + res = (b == 0) ? 0 : res; + if(static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) + { + --res; + } + } + break; + } + case ArithmeticOperation::POWER: + { + res = std::pow(a, b); + break; + } + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <> +inline int32x4_t elementwise_arithm_op>(const int32x4_t &a, const int32x4_t &b) +{ + return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); +} + +template <> +inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) +{ + return wrapper::vdiv(a, b); +} + +template <> +inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) +{ + return wrapper::vpow(a, b); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) +{ + return wrapper::vdiv(a, b); +} + +template <> +inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) +{ + return wrapper::vpow(a, b); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template +inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_arithm_op(a, b)); + } + return x; +} + +template +inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast(a, broadcast_value, reorder)); + } + return x; +} 
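[Editorial note, not part of the patch] The loop helpers above share one pattern: each vectorized *_loop processes full vector widths and returns the first index it did not handle, so the scalar tail inside elementwise_op finishes the remaining elements; the scalar DIV case additionally corrects truncating integer division toward floor when the operands' signs differ. A minimal standalone sketch of both ideas, with made-up names (fake_vector_loop, floor_div) used purely for illustration:

#include <algorithm>
#include <cstdio>
#include <vector>

// Stand-in for a vectorized body: pretends to handle 4 elements per step and
// returns the first index it did NOT process, exactly like the *_loop helpers above.
static int fake_vector_loop(int start, int end, int step, const int *a, const int *b, int *out)
{
    int x = start;
    for(; x <= end - step; x += step)
    {
        for(int i = 0; i < step; ++i) // a real kernel would issue one NEON op here
        {
            out[x + i] = std::max(a[x + i], b[x + i]);
        }
    }
    return x; // the scalar tail starts here
}

// Floor division as in the integer ArithmeticOperation::DIV case: truncating
// division is decremented when there is a remainder and the signs differ.
static int floor_div(int a, int b)
{
    if(b == 0)
    {
        return 0;
    }
    int res = a / b;
    if((a % b != 0) && ((a < 0) != (b < 0)))
    {
        --res;
    }
    return res;
}

int main()
{
    std::vector<int> a = { 1, 5, -3, 9, 2, 7, 0 }, b = { 4, 2, 8, 1, 6, 3, 5 }, out(7);
    int x = fake_vector_loop(0, 7, 4, a.data(), b.data(), out.data());
    for(; x < 7; ++x) // scalar tail, mirroring the remainder loop in elementwise_op
    {
        out[x] = std::max(a[x], b[x]);
    }
    std::printf("-7/2 truncates to %d, floor_div gives %d\n", -7 / 2, floor_div(-7, 2)); // -3 vs -4
    return 0;
}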
+ +template +void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + using scalar_type = typename VectorType::scalar_type; + + elementwise_op(in1, in2, out, window, + &elementwise_arithm_op_scalar, + &elementwise_arithm_op_broadcast_loop, + &elementwise_arithm_op_loop); +} + +template +inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b) +{ + bool res = false; + + switch(op) + { + case ComparisonOperation::Equal: + res = (a == b); + break; + case ComparisonOperation::NotEqual: + res = (a != b); + break; + case ComparisonOperation::Greater: + res = (a > b); + break; + case ComparisonOperation::GreaterEqual: + res = (a >= b); + break; + case ComparisonOperation::Less: + res = (a < b); + break; + case ComparisonOperation::LessEqual: + res = (a <= b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res ? ~static_cast(0) : static_cast(0); +} + +template +inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) +{ + OutputVectorType res = { 0, 0, 0, 0 }; + + switch(op) + { + case ComparisonOperation::Equal: + res = wrapper::vceq(a, b); + break; + case ComparisonOperation::NotEqual: + res = wrapper::vnot(wrapper::vceq(a, b)); + break; + case ComparisonOperation::Greater: + res = wrapper::vcgt(a, b); + break; + case ComparisonOperation::GreaterEqual: + res = wrapper::vcge(a, b); + break; + case ComparisonOperation::Less: + res = wrapper::vcgt(b, a); + break; + case ComparisonOperation::LessEqual: + res = wrapper::vcge(b, a); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template +inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) +{ + InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_comp_op(reorder ? broadcast_vector : a, reorder ? 
a : broadcast_vector); +} + +template +inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, a); + } + return x; +} + +template +inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); + } + return x; +} + +template +inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); + const auto b = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); + } + if(x <= window_end_x - 4) + { + const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + for(int i = 0; i < 4; i++) + { + *(output_ptr + x + i) = wrapper::vgetlane(a, i); + } + x = +4; + } + return x; +} + +template +inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op(a, b); + wrapper::vstore(output_ptr + x, res); + } + return x; +} + +template +inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op(a, b); + wrapper::vstore(output_ptr + x, wrapper::vmovn(res)); + } + return x; +} + +template +inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto a = wrapper::vloadq(input1_ptr + x); + auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op(a, b); + a = 
wrapper::vloadq(input1_ptr + x + 4); + b = wrapper::vloadq(input2_ptr + x + 4); + const auto res2 = elementwise_comp_op(a, b); + wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); + } + if(x <= window_end_x - 4) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op(a, b); + for(int i = 0; i < 4; i++) + { + *(output_ptr + x + i) = wrapper::vgetlane(res, i); + } + x = +4; + } + return x; +} + +template +void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op(in1, in2, out, window, + &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_8_loop, + &elementwise_comp_op_8_loop); +} + +template +void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op(in1, in2, out, window, + &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_16_loop, + &elementwise_comp_op_16_loop); +} + +template +void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op(in1, in2, out, window, + &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_32_loop, + &elementwise_comp_op_32_loop); +} + +inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +{ + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), + } + }; + return out; +} + +inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +{ + qasymm8x16_signed_t x = vld1q_s8(input1_ptr); + const float32x4x4_t out = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + } + }; + return out; +} + +inline void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) +{ + const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1]))); + const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) +{ + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + int32x4x4_t out = + { + { + 
vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + } + }; + store_quantized(output_ptr, out); +} + +inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) +{ + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_s8(output_ptr, vcombine_s8(pa, pb)); +} + +inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + int32x4x4_t out = + { + { + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + } + }; + store_quantized_signed(output_ptr, out); +} + +template +inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + return quantize_qasymm8(elementwise_arithm_op_scalar(a, b), qinfo); +} + +template +inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + return quantize_qasymm8_signed(elementwise_arithm_op_scalar(a, b), qinfo); +} + +template +float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) +{ + using neon_vector_float = wrapper::traits::neon_vector; + float32x4x4_t out = + { + { + elementwise_arithm_op(a.val[0], b.val[0]), + elementwise_arithm_op(a.val[1], b.val[1]), + elementwise_arithm_op(a.val[2], b.val[2]), + elementwise_arithm_op(a.val[3], b.val[3]), + } + }; + return out; +} + +template +inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + ARM_COMPUTE_UNUSED(qinfo); + return elementwise_comp_op_scalar(a, b); +} + +template +inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) +{ + uint32x4x4_t out = + { + { + elementwise_comp_op(a.val[0], b.val[0]), + elementwise_comp_op(a.val[1], b.val[1]), + elementwise_comp_op(a.val[2], b.val[2]), + elementwise_comp_op(a.val[3], b.val[3]) + } + }; + return out; +} + +template +inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, + const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_arithm_op(af, bf); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template +inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for(; x <= (window_end_x - 
window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_arithm_op(af, bf); + store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template +inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} +template +inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); + store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template +inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, + const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const uint32x4x4_t rf = elementwise_comp_op(af, bf); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template +inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); + const uint32x4x4_t rf = elementwise_comp_op(af, bf); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template +inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template +inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); + store_quantized(output_ptr + x, rf); + } + return x; +} + +inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero) + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if(is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? 
bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, + vscale1, vscale2, voffseto, invvscaleo); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if(is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, + vscale1, vscale2, voffseto, invvscaleo); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + 
int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if(is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, + vscale1, vscale2, voffseto, invvscaleo); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +template +void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar, + &elementwise_arithm_op_quantized_broadcast_loop, + &elementwise_arithm_op_quantized_loop); +} + +template +void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar, + &elementwise_arithm_op_quantized_signed_broadcast_loop, + &elementwise_arithm_op_quantized_singed_loop); +} + +template +void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, + &elementwise_comp_op_quantized_broadcast_loop, + &elementwise_comp_op_quantized_loop); +} + +template +void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, + &elementwise_comp_op_quantized_signed_broadcast_loop, + &elementwise_comp_op_quantized_signed_loop); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp new file mode 100644 index 0000000000..c5c528d3f3 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template +void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op>(in1, in2, out, window); +} + +template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op>(in1, in2, out, window); +} +template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, 
const Window &window); + +template +void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_8(in1, in2, out, window); +} +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_16(in1, in2, out, window); +} +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_32(in1, in2, out, window); +} +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..fa8e08745a --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template +void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op_quantized(in1, in2, out, window); +} + +template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_quantized(in1, in2, out, window); +} + +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git 
a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..abfdf93b75 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template +void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op_quantized_signed(in1, in2, out, window); +} + +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_quantized_signed(in1, in2, out, window); +} + +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void 
neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp new file mode 100644 index 0000000000..d764f56623 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
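The two NEON quantized files above reduce each kernel to a thin wrapper: a function template parameterised on the operator, explicitly instantiated once per supported operation, so the operator stays a compile-time constant inside the shared implementation. The sketch below models that pattern in isolation; Op, binary_kernel and run_binary are placeholder names, and plain float buffers stand in for the library's ITensor/Window types.

#include <algorithm>
#include <cstddef>
#include <stdexcept>

enum class Op { ADD, MAX };   // stand-in for an arithmetic-operation enum

// Shared implementation: the operator is a template parameter, so each
// instantiation compiles to its own specialised inner loop.
template <Op op>
void binary_kernel(const float *a, const float *b, float *dst, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
    {
        dst[i] = (op == Op::ADD) ? a[i] + b[i] : std::max(a[i], b[i]);
    }
}

// Explicit instantiations, mirroring the "template void ..._elementwise_binary(...)"
// lines in the files above.
template void binary_kernel<Op::ADD>(const float *, const float *, float *, std::size_t);
template void binary_kernel<Op::MAX>(const float *, const float *, float *, std::size_t);

// Runtime operator -> compile-time instantiation.
void run_binary(Op op, const float *a, const float *b, float *dst, std::size_t n)
{
    switch(op)
    {
        case Op::ADD: return binary_kernel<Op::ADD>(a, b, dst, n);
        case Op::MAX: return binary_kernel<Op::MAX>(a, b, dst, n);
        default: throw std::runtime_error("unsupported operator");
    }
}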
+ */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template +void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op(in1, in2, out, window); +} + +template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op(in1, in2, out, window); +} + +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp new file mode 100644 index 0000000000..bb33fd2814 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
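Every SVE translation unit in this patch is fenced with #if defined(ARM_COMPUTE_ENABLE_SVE), so its symbols only exist in builds that enable SVE code generation. The fragment below sketches that guard together with one way a caller could probe for SVE at runtime before dispatching; the getauxval/HWCAP_SVE probe is a Linux-only illustration and is not part of this patch.

#if defined(__linux__) && defined(__aarch64__)
#include <sys/auxv.h>   // getauxval
#include <asm/hwcap.h>  // HWCAP_SVE
#endif

#if defined(ARM_COMPUTE_ENABLE_SVE)
// Declared (and typically built with SVE-enabling flags) only when the
// build system defines ARM_COMPUTE_ENABLE_SVE.
void sve_only_kernel();
#endif

// Illustrative runtime probe a dispatcher might consult before calling an
// SVE kernel; an assumption for this sketch, not the library's mechanism.
bool cpu_has_sve()
{
#if defined(__linux__) && defined(__aarch64__)
    return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
#else
    return false;
#endif
}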
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template +void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op(in1, in2, out, window); +} + +template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op(in1, in2, out, window); +} +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp new file mode 100644 index 0000000000..b3046e90a8 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +#include "src/core/NEON/SVEMath.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template +struct LoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + const InputScalarType *input2_ptr; + OutputScalarType *output_ptr; +}; + +template +struct BroadcastLoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + InputScalarType broadcast_value; + OutputScalarType *output_ptr; + bool reorder; +}; + +template +void arithmetic_op_loop(svbool_t pg, const LoopArguments &args) +{ + const auto in1 = svld1(pg, args.input1_ptr); + const auto in2 = svld1(pg, args.input2_ptr); + const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); + svst1(pg, args.output_ptr, res); +} + +template +void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) +{ + const auto non_broadcast_vector = svld1(pg, args.input1_ptr); + const auto broadcast_vector = svdup_n(args.broadcast_value); + const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; + const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; + const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); + svst1(pg, args.output_ptr, res); +} + +template +void comparison_op_loop(svbool_t pg, const LoopArguments &args) +{ + const auto in1 = svld1(pg, args.input1_ptr); + const auto in2 = svld1(pg, args.input2_ptr); + const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + svst1(output_pg, args.output_ptr, res); +} + +template +void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) +{ + const auto non_broadcast_vector = svld1(pg, args.input1_ptr); + const auto broadcast_vector = svdup_n(args.broadcast_value); + const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; + const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; + const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + svst1(output_pg, args.output_ptr, res); +} + +template +using LoopFuncType = void (*)(svbool_t, const LoopArguments &); + +template +using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments &); + +template ::type, + typename OutputScalarType = typename sve_scalar::type> +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OperatorType op, + LoopFuncType func, + BroadcastLoopFuncType broadcast_func) +{ + const auto all_true_pg = svptrue(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt(x, window_end_x); + do + { + broadcast_func(pg, + { + op, + non_broadcast_input_ptr + x, + broadcast_value, + output_ptr + x, + !is_broadcast_input_2 + }); + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt(x, window_end_x); + do + { + func(pg, + { + op, + input1_ptr + x, + input2_ptr + x, + output_ptr + x + }); + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window) +{ + using VectorType = typename sve_vector::type; + + elementwise_op(in1, in2, out, window, op, + &arithmetic_op_loop, + &arithmetic_op_broadcast_loop); +} +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const 
ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + using InputVectorType = typename sve_vector::type; + using OutputVectorType = typename sve_vector::type; + + elementwise_op(in1, in2, out, window, op, + &comparison_op_loop, + &comparison_op_broadcast_loop); +} + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const 
ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <> +svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint32_t elementwise_div(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint16_t elementwise_div(svbool_t &pg, const svint16_t &a, const svint16_t &b) +{ + ARM_COMPUTE_UNUSED(pg, a, b); + ARM_COMPUTE_ERROR("Not supported"); +} + +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h new file mode 100644 index 0000000000..b7425c8626 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
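elementwise_op() in sve/impl.cpp above drives both the plain and the broadcast loop with predicates from svwhilelt, advancing by svcnt lanes and stopping when svptest_any reports no active lane, so the final partial vector needs no scalar tail. Below is a standalone sketch of that loop shape for a single fp32 addition, written with standard ACLE SVE intrinsics; it assumes an SVE-capable toolchain and is only a model of the pattern, not the library's kernel.

#include <arm_sve.h>
#include <cstdint>

void add_f32_sve(const float *a, const float *b, float *dst, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, n);               // active lanes for this step
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x); // masked loads
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, dst + x, svadd_f32_z(pg, va, vb));
        x += svcntw();                               // fp32 lanes per vector
        pg = svwhilelt_b32(x, n);                    // shrinks on the tail
    }
    while(svptest_any(svptrue_b32(), pg));           // stop when no lanes remain
}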
+ */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H +#if defined(ARM_COMPUTE_ENABLE_SVE) + +#include "arm_compute/core/Helpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/NEON/wrapper/svtraits.h" + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template +VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) +{ + return svpow_z(pg, a, b); +} + +template +VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) +{ + return svdiv_z(pg, a, b); +} + +template +svbool_t narrow_to_byte_predicate(svbool_t pg) +{ + const auto all_false = svpfalse(); + + switch(bytewidth) + { + case 8: + pg = svuzp1_b32(pg, all_false); + /* fall through */ + case 4: + pg = svuzp1_b16(pg, all_false); + /* fall through */ + case 2: + pg = svuzp1_b8(pg, all_false); + /* fall through */ + default: + break; + } + return pg; +} + +template +VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) +{ + using ScalarType = typename wrapper::sve_scalar::type; + VectorType res{}; + + switch(op) + { + case ArithmeticOperation::MAX: + res = svmax_z(pg, a, b); + break; + case ArithmeticOperation::MIN: + res = svmin_z(pg, a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + const auto tmp = svsub_z(pg, a, b); + res = svmul_z(pg, tmp, tmp); + break; + } + case ArithmeticOperation::PRELU: + { + const auto zero = svdup_n(ScalarType(0)); + const auto tmp = svmul_z(pg, a, b); + const auto gt = svcmpgt(pg, a, zero); + res = svsel(gt, a, tmp); + break; + } + case ArithmeticOperation::DIV: + { + res = elementwise_div(pg, a, b); + break; + } + case ArithmeticOperation::POWER: + { + res = elementwise_pow(pg, a, b); + break; + } + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template +OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +{ + svbool_t selection_vector{}; + + switch(op) + { + case ComparisonOperation::Equal: + selection_vector = svcmpeq(pg, a, b); + break; + case ComparisonOperation::NotEqual: + selection_vector = svcmpne(pg, a, b); + break; + case ComparisonOperation::Greater: + selection_vector = svcmpgt(pg, a, b); + break; + case ComparisonOperation::GreaterEqual: + selection_vector = svcmpge(pg, a, b); + break; + case ComparisonOperation::Less: + selection_vector = svcmplt(pg, a, b); + break; + case ComparisonOperation::LessEqual: + selection_vector = svcmple(pg, a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + using InputScalarType = typename wrapper::sve_scalar::type; + selection_vector = narrow_to_byte_predicate(selection_vector); + + using OutputScalarType = typename wrapper::sve_scalar::type; + const auto false_vector = svdup_n(static_cast((uint32_t)0)); + const auto true_vector = svdup_n(static_cast(~(uint32_t)0)); + auto ret = svsel(selection_vector, true_vector, false_vector); + + return ret; +} + +template +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif // defined(ARM_COMPUTE_ENABLE_SVE) +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git 
a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp new file mode 100644 index 0000000000..a4f4d0fc82 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template +void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op(in1, in2, out, window); +} +template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op(in1, in2, out, window); +} +template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); 
+template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op(in1, in2, out, window); +} +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op(in1, in2, out, window); +} +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op(in1, in2, out, window); +} +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h new file mode 100644 index 0000000000..c35ca2d6c3 --- 
/dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H + +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template +struct QuantizedLoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + const InputScalarType *input2_ptr; + OutputScalarType *output_ptr; + + const svint32_t &in1_offset; + const svint32_t &in2_offset; + const svint32_t &out_offset; + const svfloat32_t &in1_scale; + const svfloat32_t &in2_scale; + const svfloat32_t &out_scale; +}; + +template +struct BroadcastQuantizedLoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + float broadcast_value; + OutputScalarType *output_ptr; + bool reorder; + + const svint32_t &in1_offset; + const svint32_t &out_offset; + const svfloat32_t &in1_scale; + const svfloat32_t &out_scale; +}; + +inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) +{ + auto x = svld1(pg, ptr); + + const auto widened = svcreate4( + svmovlb(svmovlb(x)), + svmovlt(svmovlb(x)), + svmovlb(svmovlt(x)), + svmovlt(svmovlt(x))); + + pg = svptrue_b8(); + + return svcreate4( + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); +} + +inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) +{ + auto x = svld1(pg, ptr); + + //vprint(x); + + const auto widened = svcreate4( + svmovlb(svmovlb(x)), + svmovlt(svmovlb(x)), + svmovlb(svmovlt(x)), + svmovlt(svmovlt(x))); + + pg = svptrue_b8(); + + return svcreate4( + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), + 
svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); +} + +inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +{ + const auto quantized = svcreate4( + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + + const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); + const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); + const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); + svst1(pg, ptr, narrowed); +} + +inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +{ + const auto quantized = svcreate4( + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + + const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); + const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); + const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); + + svst1(pg, ptr, narrowed); +} + +template +inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) +{ + const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); + const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); + + const auto result = svcreate4( + elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), + elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), + elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), + elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); + + store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); +} + +template +inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) +{ + const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); + const auto in2 = svcreate4( + svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); + + const auto &af = args.reorder ? in2 : in1; + const auto &bf = args.reorder ? 
in1 : in2; + + const auto result = svcreate4( + elementwise_arithmetic_op(pg, svget4(af, 0), svget4(bf, 0), args.op), + elementwise_arithmetic_op(pg, svget4(af, 1), svget4(bf, 1), args.op), + elementwise_arithmetic_op(pg, svget4(af, 2), svget4(bf, 2), args.op), + elementwise_arithmetic_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); + + store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); +} + +template +inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) +{ + const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); + const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); + + using OutputVectorType = typename wrapper::traits::sve_vector::type; + + const auto result = svcreate4( + elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), + elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), + elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), + elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, args.output_ptr, zipped); +} + +template +inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) +{ + const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); + const auto in2 = svcreate4( + svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); + + const auto &af = args.reorder ? in2 : in1; + const auto &bf = args.reorder ? 
in1 : in2; + + using OutputVectorType = typename wrapper::traits::sve_vector::type; + + const auto result = svcreate4( + elementwise_comparison_op(pg, svget4(af, 0), svget4(bf, 0), args.op), + elementwise_comparison_op(pg, svget4(af, 1), svget4(bf, 1), args.op), + elementwise_comparison_op(pg, svget4(af, 2), svget4(bf, 2), args.op), + elementwise_comparison_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, args.output_ptr, zipped); +} + +template +using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments &); + +template +using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments &); + +template ::type, + typename OutputScalarType = typename wrapper::sve_scalar::type> +void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OperatorType op, + LoopQuantizedFuncType func, + BroadcastQuantizedLoopFuncType broadcast_func) +{ + const auto all_true_pg = wrapper::svptrue(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); + const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = is_broadcast_input_2 ? 
in2->info()->quantization_info() : in1->info()->quantization_info(); + + const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); + const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto args = BroadcastQuantizedLoopArguments + { + op, + non_broadcast_input_ptr + x, + Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo), + output_ptr + x, + !is_broadcast_input_2, + non_broadcast_voffset, output_voffset, + non_broadcast_vscale, output_vscale + }; + broadcast_func(pg, args); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); + const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); + + const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); + const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto args = QuantizedLoopArguments + { + op, + input1_ptr + x, + input2_ptr + x, + output_ptr + x, + in1_voffset, in2_voffset, output_voffset, + in1_vscale, in2_vscale, output_vscale + }; + func(pg, args); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template +void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + using VectorType = typename wrapper::traits::sve_vector::type; + elementwise_quantized_op(in1, in2, out, window, op, + &arithmetic_op_quantized_loop, + &arithmetic_op_broadcast_quantized_loop); +} + +template +void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + using InputVectorType = typename wrapper::traits::sve_vector::type; + using OutputVectorType = typename wrapper::traits::sve_vector::type; + elementwise_quantized_op(in1, in2, out, window, op, + 
&comparison_op_quantized_loop, + &comparison_op_broadcast_quantized_loop); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ \ No newline at end of file diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp new file mode 100644 index 0000000000..63c75c3d4d --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template +void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_quantized_op(in1, in2, out, window); +} + +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_quantized_op(in1, in2, out, window); +} + +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, 
ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE2) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp new file mode 100644 index 0000000000..fe332df386 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
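The sve2/impl.h helpers above (load_quantized, store_quantized) widen each quantized vector into four s32 parts, dequantize with (q - offset) * scale, evaluate the operation in fp32, then requantize with round-to-nearest and saturating narrows. The scalar reference below spells out that per-element arithmetic; UniformQuantInfo and the helper names are illustrative, and the MAX example is a sketch rather than the kernel's code.

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQuantInfo { float scale; int offset; };   // stand-in for a uniform quantization descriptor

inline float dequantize(uint8_t q, UniformQuantInfo qi)
{
    return (static_cast<int>(q) - qi.offset) * qi.scale;
}

inline uint8_t quantize(float v, UniformQuantInfo qi)
{
    const int q = static_cast<int>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));  // saturate to the QASYMM8 range
}

// Elementwise MAX on two QASYMM8 buffers with independent quantization.
void max_qasymm8(const uint8_t *a, UniformQuantInfo qa,
                 const uint8_t *b, UniformQuantInfo qb,
                 uint8_t *dst, UniformQuantInfo qd, int n)
{
    for(int i = 0; i < n; ++i)
    {
        dst[i] = quantize(std::max(dequantize(a[i], qa), dequantize(b[i], qb)), qd);
    }
}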
+ */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template +void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_quantized_op(in1, in2, out, window); +} + +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template +void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_quantized_op(in1, in2, out, window); +} + +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE2) diff --git a/src/cpu/kernels/elementwise_binary/list.h b/src/cpu/kernels/elementwise_binary/list.h new file mode 100644 index 0000000000..78a098e7bb --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/list.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H +#define SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ELEMETWISE_BINARY_KERNEL(func_name) \ + template \ + void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) + +DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp16_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp32_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve_s32_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve_s16_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_signed_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_signed_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_fp16_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_fp32_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_s16_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_s32_elementwise_binary); + +#undef DECLARE_ELEMETWISE_BINARY_KERNEL + +#define DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(func_name) \ + template \ + void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) + +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_u8_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_s16_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_s32_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_fp32_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_fp16_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_signed_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_signed_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_fp16_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_u8_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_s16_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_s32_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_fp32_comparison_elementwise_binary); +#undef DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H \ No newline at end of file diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp new file mode 100644 index 0000000000..976d006f11 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_op<__fp16>(in, out, window, op); +} +} +} // namespace arm_compute +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp new file mode 100644 index 0000000000..21f4d9d326 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_op(in, out, window, op); +} +} +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h new file mode 100644 index 0000000000..fd930ae7cf --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H +#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H + +#include "arm_compute/core/Types.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +template +inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return 1 / sqrt(a); + case ElementWiseUnary::EXP: + return std::exp(a); + case ElementWiseUnary::NEG: + return -a; + case ElementWiseUnary::LOG: + return std::log(a); + case ElementWiseUnary::ABS: + return std::abs(a); + case ElementWiseUnary::ROUND: + return support::cpp11::nearbyint(a); + case ElementWiseUnary::SIN: + return std::sin(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template +inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return wrapper::vinvsqrt(a); + case ElementWiseUnary::EXP: + return wrapper::vexpq(a); + case ElementWiseUnary::NEG: + return wrapper::vneg(a); + case ElementWiseUnary::LOG: + return wrapper::vlog(a); + case ElementWiseUnary::ABS: + return wrapper::vabs(a); + case ElementWiseUnary::ROUND: + return wrapper::vround(a); + case ElementWiseUnary::SIN: + return wrapper::vsin(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template +void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + + int x = window_start_x; + for(; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, elementwise_op_imp(op, wrapper::vloadq(input_ptr + x))); + } + for(; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); + } + }, + input, output); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp new file mode 100644 index 0000000000..ef3120e206 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_op(in, out, window, op); +} +} +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp new file mode 100644 index 0000000000..4dd4a1905c --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_sve_op(in, out, window, op); +} +} +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp new file mode 100644 index 0000000000..3498a0b1ea --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_sve_op(in, out, window, op); +} +} +} // namespace arm_compute +#endif //ARM_COMPUTE_ENABLE_SVE diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp new file mode 100644 index 0000000000..0c04a56be0 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +template +inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return svinvsqrt(pg, a); + case ElementWiseUnary::EXP: + return wrapper::svexp_z(pg, a); + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::LOG: + return wrapper::svlog_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + case ElementWiseUnary::ROUND: + return svrintn_z(pg, a); + case ElementWiseUnary::SIN: + return wrapper::svsin_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template +inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const auto all_true_pg = wrapper::svptrue(); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input, output); +} + +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.h b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h new file mode 100644 index 0000000000..08f4438696 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H +#if defined(ARM_COMPUTE_ENABLE_SVE) +namespace arm_compute +{ +namespace cpu +{ +template +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +} // namespace cpu +} // namespace arm_compute +#endif // defined(ARM_COMPUTE_ENABLE_SVE) +#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp new file mode 100644 index 0000000000..c3e3adfc9e --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_sve_op(in, out, window, op); +} +} +} // namespace arm_compute +#endif //ARM_COMPUTE_ENABLE_SVE diff --git a/src/cpu/kernels/elementwise_unary/list.h b/src/cpu/kernels/elementwise_unary/list.h new file mode 100644 index 0000000000..2a41b74c51 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/list.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H +#define SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H + +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ELEMETWISE_UNARY_KERNEL(func_name) \ + void func_name(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) + +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp16_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_s32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp16_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_s32_elementwise_unary); + +#undef DECLARE_ELEMETWISE_UNARY_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H \ No newline at end of file -- cgit v1.2.1
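
Note on the new layout: the templated implementations now live in the generic/*/impl.h headers, while the thin per-data-type translation units (fp16.cpp, fp32.cpp, integer.cpp, qasymm8*.cpp) explicitly instantiate them and expose the named entry points declared in elementwise_binary/list.h and elementwise_unary/list.h. Below is a minimal usage sketch, not part of the patch: the ArithmeticOperation template parameter of the binary kernels and the run_* wrapper functions are assumptions made for illustration, since the declaration macros' template parameter lists are not reproduced in this listing.

// Minimal usage sketch (assumptions noted above), not part of the patch.
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"

#include "src/cpu/kernels/elementwise_binary/list.h"
#include "src/cpu/kernels/elementwise_unary/list.h"

namespace example
{
using namespace arm_compute;

// Unary kernels take the operation as a runtime argument, so a caller only
// selects the data-type/ISA variant:
void run_unary_exp_fp32(const ITensor *in, ITensor *out, const Window &window)
{
    cpu::neon_fp32_elementwise_unary(in, out, window, ElementWiseUnary::EXP);
}

// Binary arithmetic kernels are templated on the operation; each per-type
// .cpp file in the patch explicitly instantiates the template for every
// supported operation, e.g. (assumed form):
//
//   template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(
//       const ITensor *, const ITensor *, ITensor *, const Window &);
//
// so a caller picks the operation at compile time:
void run_binary_add_fp32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
    cpu::neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(in1, in2, out, window);
}
} // namespace example

This split presumably lets CpuElementwiseKernel.cpp build its dispatch tables from the two list.h headers alone, while each ISA-specific translation unit stays behind its own compile-time guard (ARM_COMPUTE_ENABLE_SVE, ARM_COMPUTE_ENABLE_SVE2, or the FP16 feature macros).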