diff options
author | Dana Zlotnik <dana.zlotnik@arm.com> | 2021-11-28 14:46:12 +0200 |
---|---|---|
committer | Dana Zlotnik <dana.zlotnik@arm.com> | 2022-01-19 14:24:13 +0000 |
commit | d5c496d87e3b446532dd3dd163e9768de0daff4e (patch) | |
tree | 901d9fe6dd369edc7568a558302b87bfd8623616 | |
parent | c48a3e5431ac48fbbd53522e34c99ea4f4ce3e41 (diff) | |
download | ComputeLibrary-d5c496d87e3b446532dd3dd163e9768de0daff4e.tar.gz |
Decouple CpuElementwiseKernel
1- reorganize the folders struct according the new definition
2- separate between unary and binary implementations
3- decuple kernels - unary , binary op and binary comparision
Resolves COMPMID-4634
Change-Id: I0195846cc372e74a63c659069a4508de53a22110
Signed-off-by: Dana Zlotnik <dana.zlotnik@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6860
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
30 files changed, 1682 insertions, 685 deletions
diff --git a/Android.bp b/Android.bp index 0ce9f09290..7e18b3dbb9 100644 --- a/Android.bp +++ b/Android.bp @@ -461,8 +461,24 @@ cc_library_static { "src/cpu/kernels/crop/generic/neon/fp32.cpp", "src/cpu/kernels/crop/generic/neon/impl.cpp", "src/cpu/kernels/crop/generic/neon/integer.cpp", - "src/cpu/kernels/elementwise/sve/elementwise.cpp", - "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp", + "src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp", + "src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp", + "src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp", + "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp", + "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp", + "src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp", + "src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp", + "src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp", + "src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp", + "src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp", + "src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp", + "src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp", + "src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp", + "src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp", + "src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp", + "src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp", + "src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp", + "src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp", "src/cpu/kernels/floor/neon/fp16.cpp", "src/cpu/kernels/floor/neon/fp32.cpp", "src/cpu/kernels/genproposals/generic/neon/fp16.cpp", diff --git a/filelist.json b/filelist.json index a306845561..a4773ded04 100644 --- a/filelist.json +++ b/filelist.json @@ -1259,10 +1259,22 @@ "common": [ "src/cpu/operators/CpuElementwise.cpp", "src/cpu/kernels/CpuElementwiseKernel.cpp", - "src/runtime/NEON/functions/NEElementwiseOperations.cpp" + "src/runtime/NEON/functions/NEElementwiseOperations.cpp", + "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp", + "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp" ], + "neon":{ + "fp32": ["src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp"], + "fp16": ["src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp"], + "integer": ["src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp"] + }, "sve": { - "common": [ "src/cpu/kernels/elementwise/sve/elementwise.cpp" ] + "common": ["src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp" ], + "integer": ["src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp"], + "fp32": ["src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp"], + "fp16": ["src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp"], + "qasymm8": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp"], + "qasymm8_signed": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp"] } } }, @@ -1273,8 +1285,16 @@ "src/cpu/kernels/CpuElementwiseUnaryKernel.cpp", "src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp" ], + "neon": { + "integer": ["src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp"], + "fp32": ["src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp"], + "fp16": ["src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp"] + }, "sve": { - "common": [ "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp" ] + "common": ["src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp" ], + "integer": ["src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp"], + "fp32": ["src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp"], + "fp16": ["src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp"] } } }, diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp index 91de24b850..53179ae95f 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,10 +28,7 @@ #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/cpu/kernels/elementwise/neon/elementwise_list.h" -#include "src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h" +#include "src/cpu/kernels/elementwise_binary/list.h" #include <arm_neon.h> @@ -68,76 +65,73 @@ CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0, { "sve_fp32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>)) + REGISTER_FP32_SVE((arm_compute::cpu::sve_fp32_elementwise_binary<op>)) }, { "sve_s32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>)) + REGISTER_INTEGER_SVE((arm_compute::cpu::sve_s32_elementwise_binary<op>)) }, { "sve_s16_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>)) + REGISTER_INTEGER_SVE((arm_compute::cpu::sve_s16_elementwise_binary<op>)) + }, + { + "sve_fp16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE((arm_compute::cpu::sve_fp16_elementwise_binary<op>)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { "neon_fp32_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>)) + REGISTER_FP32_NEON((arm_compute::cpu::neon_fp32_elementwise_binary<op>)) }, { "neon_s32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>)) + REGISTER_INTEGER_NEON((arm_compute::cpu::neon_s32_elementwise_binary<op>)) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, + REGISTER_FP16_NEON((arm_compute::cpu::neon_fp16_elementwise_binary<op>)) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_s16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON((arm_compute::cpu::neon_s16_elementwise_binary<op>)) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #if defined(ARM_COMPUTE_ENABLE_SVE2) { "sve2_qu8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>)) + REGISTER_QASYMM8_SVE2((arm_compute::cpu::sve2_qasymm8_elementwise_binary<op>)) }, { "sve2_qs8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>)) + REGISTER_QASYMM8_SIGNED_SVE2((arm_compute::cpu::sve2_qasymm8_signed_elementwise_binary<op>)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) { "neon_qu8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>)) + REGISTER_QASYMM8_NEON((arm_compute::cpu::neon_qasymm8_elementwise_binary<op>)) }, { "neon_qs8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>)) + REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::neon_qasymm8_signed_elementwise_binary<op>)) }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>)) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_s16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ }; for(const auto &uk : kernels) @@ -161,82 +155,82 @@ CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, c { "sve_u8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>)) + REGISTER_INTEGER_SVE(arm_compute::cpu::sve_u8_comparison_elementwise_binary<op>) }, { "sve_fp32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>)) + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_comparison_elementwise_binary<op>) }, { "sve_s16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>)) + REGISTER_INTEGER_SVE(arm_compute::cpu::sve_s16_comparison_elementwise_binary<op>) }, { "sve_s32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>)) + REGISTER_INTEGER_SVE(arm_compute::cpu::sve_s32_comparison_elementwise_binary<op>) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { "neon_u8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>)) + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_comparison_elementwise_binary<op>) }, { "neon_fp32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>)) + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_comparison_elementwise_binary<op>) }, { "neon_s16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>)) + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_comparison_elementwise_binary<op>) }, { "neon_s32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>)) + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_comparison_elementwise_binary<op>) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #if defined(ARM_COMPUTE_ENABLE_SVE2) { "sve2_qu8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>)) + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_comparison_elementwise_binary<op>) }, { "sve2_qs8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>)) + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_comparison_elementwise_binary<op>) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) { "neon_qu8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>)) + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_comparison_elementwise_binary<op>) }, { "neon_qs8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>)) + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_comparison_elementwise_binary<op>) }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ +#endif /* defined(ARM_COMPUTE_ENABLE_NEON ||ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_SVE) { "sve_fp16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>)) + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_comparison_elementwise_binary<op>) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "neon_fp16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>)) + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_comparison_elementwise_binary<op>) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ }; diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp index 61bc64b235..f3a82c23f0 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -31,8 +31,7 @@ #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/cpu/kernels/elementwise/neon/elementwise_unary_list.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_unary_list.h" +#include "src/cpu/kernels/elementwise_unary/list.h" #include "support/ToolchainSupport.h" namespace arm_compute @@ -52,7 +51,7 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai { return data.dt == DataType::F32 && data.isa.sve; }, - REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op<float>) + REGISTER_FP32_SVE(sve_fp32_elementwise_unary) }, { "sve_fp16_elementwise_unary", @@ -60,31 +59,31 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai { return (data.dt == DataType::F16) && data.isa.sve; }, - REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>), + REGISTER_FP16_SVE(sve_fp16_elementwise_unary), }, { "sve_s32_elementwise_unary", [](const DataTypeISASelectorData & data) { return data.dt == DataType::S32 && data.isa.sve; }, - REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op<int32_t>), + REGISTER_INTEGER_SVE(sve_s32_elementwise_unary), }, #endif // defined(ARM_COMPUTE_ENABLE_SVE) #if defined(ARM_COMPUTE_ENABLE_NEON) { "neon_fp32_elementwise_unary", [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<float>), + REGISTER_FP32_NEON(neon_fp32_elementwise_unary), }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "neon_fp16_elementwise_unary", [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<__fp16>), + REGISTER_FP32_NEON(neon_fp16_elementwise_unary), }, #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "neon_s32_elementwise_unary", [](const DataTypeISASelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op<int32_t>), + REGISTER_INTEGER_NEON(neon_s32_elementwise_unary), }, #endif // defined(ARM_COMPUTE_ENABLE_NEON) }; diff --git a/src/cpu/kernels/elementwise/neon/elementwise_list.h b/src/cpu/kernels/elementwise/neon/elementwise_list.h deleted file mode 100644 index 43e44be5e2..0000000000 --- a/src/cpu/kernels/elementwise/neon/elementwise_list.h +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H - -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); - for(; x < window_end_x; ++x) - { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); - } -} - -template <ArithmeticOperation op, typename ScalarType> -inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b) -{ - auto res = ScalarType(0); - - switch(op) - { - case ArithmeticOperation::MAX: - res = std::max(a, b); - break; - case ArithmeticOperation::MIN: - res = std::min(a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - res = (a - b) * (a - b); - break; - } - case ArithmeticOperation::PRELU: - { - res = (a > 0 ? a : a * b); - break; - } - case ArithmeticOperation::DIV: - { - res = a / b; - if(std::is_integral<ScalarType>::value) - { - res = (b == 0) ? 0 : res; - if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0))) - { - --res; - } - } - break; - } - case ArithmeticOperation::POWER: - { - res = std::pow(a, b); - break; - } - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template <ArithmeticOperation op, typename VectorType> -inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) -{ - using vec_type = typename VectorType::type; - using scalar_type = typename VectorType::scalar_type; - using tag_type = typename VectorType::tag_type; - - vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); - - switch(op) - { - case ArithmeticOperation::MAX: - res = wrapper::vmax(a, b); - break; - case ArithmeticOperation::MIN: - res = wrapper::vmin(a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - const vec_type tmp = wrapper::vsub(a, b); - res = wrapper::vmul(tmp, tmp); - break; - } - case ArithmeticOperation::PRELU: - { - const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); - const vec_type tmp = wrapper::vmul(a, b); - const auto gt = wrapper::vcgt(a, zero); - - res = wrapper::vbsl(gt, a, tmp); - break; - } - - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template <> -inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b) -{ - return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); -} - -template <> -inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b) -{ - return wrapper::vdiv(a, b); -} - -template <> -inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b) -{ - return wrapper::vpow(a, b); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b) -{ - return wrapper::vdiv(a, b); -} - -template <> -inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b) -{ - return wrapper::vpow(a, b); -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template <ArithmeticOperation op, typename ScalarType, typename VectorType> -inline typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) -{ - using tag_type = typename VectorType::tag_type; - using vec_type = typename VectorType::type; - - vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{}); - return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); -} - -template <ArithmeticOperation op, typename ScalarType, typename VectorType> -inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b)); - } - return x; -} - -template <ArithmeticOperation op, typename ScalarType, typename VectorType> -inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder)); - } - return x; -} - -template <ArithmeticOperation op, typename VectorType> -void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using scalar_type = typename VectorType::scalar_type; - - elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window, - &elementwise_arithm_op_scalar<op, scalar_type>, - &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>, - &elementwise_arithm_op_loop<op, scalar_type, VectorType>); -} - -template <ComparisonOperation op, typename InputScalarType> -inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b) -{ - bool res = false; - - switch(op) - { - case ComparisonOperation::Equal: - res = (a == b); - break; - case ComparisonOperation::NotEqual: - res = (a != b); - break; - case ComparisonOperation::Greater: - res = (a > b); - break; - case ComparisonOperation::GreaterEqual: - res = (a >= b); - break; - case ComparisonOperation::Less: - res = (a < b); - break; - case ComparisonOperation::LessEqual: - res = (a <= b); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0); -} - -template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType> -inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) -{ - OutputVectorType res = { 0, 0, 0, 0 }; - - switch(op) - { - case ComparisonOperation::Equal: - res = wrapper::vceq(a, b); - break; - case ComparisonOperation::NotEqual: - res = wrapper::vnot(wrapper::vceq(a, b)); - break; - case ComparisonOperation::Greater: - res = wrapper::vcgt(a, b); - break; - case ComparisonOperation::GreaterEqual: - res = wrapper::vcge(a, b); - break; - case ComparisonOperation::Less: - res = wrapper::vcgt(b, a); - break; - case ComparisonOperation::LessEqual: - res = wrapper::vcge(b, a); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType> -inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) -{ - InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, a); - } - return x; -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); - } - return x; -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); - const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); - } - if(x <= window_end_x - 4) - { - const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - for(int i = 0; i < 4; i++) - { - *(output_ptr + x + i) = wrapper::vgetlane(a, i); - } - x = +4; - } - return x; -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b); - wrapper::vstore(output_ptr + x, res); - } - return x; -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b); - wrapper::vstore(output_ptr + x, wrapper::vmovn(res)); - } - return x; -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto a = wrapper::vloadq(input1_ptr + x); - auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); - a = wrapper::vloadq(input1_ptr + x + 4); - b = wrapper::vloadq(input2_ptr + x + 4); - const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); - wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); - } - if(x <= window_end_x - 4) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); - for(int i = 0; i < 4; i++) - { - *(output_ptr + x + i) = wrapper::vgetlane(res, i); - } - x = +4; - } - return x; -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, - &elementwise_comp_op_scalar<op, InputScalarType>, - &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>, - &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>); -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, - &elementwise_comp_op_scalar<op, InputScalarType>, - &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>, - &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>); -} - -template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, - &elementwise_comp_op_scalar<op, InputScalarType>, - &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>, - &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>); -} -} // namesapce cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp new file mode 100644 index 0000000000..6091ef215e --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window); +} + +template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window); +} + +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} +} // namespace arm_compute +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp new file mode 100644 index 0000000000..2d8fec91c5 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window); +} + +template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window); +} +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h index 3b4c112770..ead54ab14e 100644 --- a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h +++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,16 +21,464 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H +#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H +#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H -#include "src/cpu/kernels/elementwise/neon/elementwise_list.h" +#include "src/core/NEON/NEAsymm.h" namespace arm_compute { namespace cpu { -float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +template <ArithmeticOperation op, typename VectorType> +typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) +{ + using vec_type = typename VectorType::type; + using scalar_type = typename VectorType::scalar_type; + using tag_type = typename VectorType::tag_type; + + vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); + + switch(op) + { + case ArithmeticOperation::MAX: + res = wrapper::vmax(a, b); + break; + case ArithmeticOperation::MIN: + res = wrapper::vmin(a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + const vec_type tmp = wrapper::vsub(a, b); + res = wrapper::vmul(tmp, tmp); + break; + } + case ArithmeticOperation::PRELU: + { + const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); + const vec_type tmp = wrapper::vmul(a, b); + const auto gt = wrapper::vcgt(a, zero); + + res = wrapper::vbsl(gt, a, tmp); + break; + } + + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) +{ + using tag_type = typename VectorType::tag_type; + using vec_type = typename VectorType::type; + + vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{}); + return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); + for(; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +template <ArithmeticOperation op, typename ScalarType> +inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch(op) + { + case ArithmeticOperation::MAX: + res = std::max(a, b); + break; + case ArithmeticOperation::MIN: + res = std::min(a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + res = (a - b) * (a - b); + break; + } + case ArithmeticOperation::PRELU: + { + res = (a > 0 ? a : a * b); + break; + } + case ArithmeticOperation::DIV: + { + res = a / b; + if(std::is_integral<ScalarType>::value) + { + res = (b == 0) ? 0 : res; + if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0))) + { + --res; + } + } + break; + } + case ArithmeticOperation::POWER: + { + res = std::pow(a, b); + break; + } + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <> +inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b) +{ + return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); +} + +template <> +inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b) +{ + return wrapper::vdiv(a, b); +} + +template <> +inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b) +{ + return wrapper::vpow(a, b); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b) +{ + return wrapper::vdiv(a, b); +} + +template <> +inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b) +{ + return wrapper::vpow(a, b); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b)); + } + return x; +} + +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder)); + } + return x; +} + +template <ArithmeticOperation op, typename VectorType> +void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + using scalar_type = typename VectorType::scalar_type; + + elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window, + &elementwise_arithm_op_scalar<op, scalar_type>, + &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>, + &elementwise_arithm_op_loop<op, scalar_type, VectorType>); +} + +template <ComparisonOperation op, typename InputScalarType> +inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b) +{ + bool res = false; + + switch(op) + { + case ComparisonOperation::Equal: + res = (a == b); + break; + case ComparisonOperation::NotEqual: + res = (a != b); + break; + case ComparisonOperation::Greater: + res = (a > b); + break; + case ComparisonOperation::GreaterEqual: + res = (a >= b); + break; + case ComparisonOperation::Less: + res = (a < b); + break; + case ComparisonOperation::LessEqual: + res = (a <= b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0); +} + +template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType> +inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) +{ + OutputVectorType res = { 0, 0, 0, 0 }; + + switch(op) + { + case ComparisonOperation::Equal: + res = wrapper::vceq(a, b); + break; + case ComparisonOperation::NotEqual: + res = wrapper::vnot(wrapper::vceq(a, b)); + break; + case ComparisonOperation::Greater: + res = wrapper::vcgt(a, b); + break; + case ComparisonOperation::GreaterEqual: + res = wrapper::vcge(a, b); + break; + case ComparisonOperation::Less: + res = wrapper::vcgt(b, a); + break; + case ComparisonOperation::LessEqual: + res = wrapper::vcge(b, a); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType> +inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) +{ + InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, a); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); + const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); + } + if(x <= window_end_x - 4) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + for(int i = 0; i < 4; i++) + { + *(output_ptr + x + i) = wrapper::vgetlane(a, i); + } + x = +4; + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b); + wrapper::vstore(output_ptr + x, res); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b); + wrapper::vstore(output_ptr + x, wrapper::vmovn(res)); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto a = wrapper::vloadq(input1_ptr + x); + auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + a = wrapper::vloadq(input1_ptr + x + 4); + b = wrapper::vloadq(input2_ptr + x + 4); + const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); + } + if(x <= window_end_x - 4) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + for(int i = 0; i < 4; i++) + { + *(output_ptr + x + i) = wrapper::vgetlane(res, i); + } + x = +4; + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, + &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, + &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, + &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>); +} + +inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { qasymm8x16_t x = vld1q_u8(input1_ptr); const float32x4x4_t out = @@ -45,7 +493,7 @@ float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, return out; } -float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { qasymm8x16_signed_t x = vld1q_s8(input1_ptr); const float32x4x4_t out = @@ -60,21 +508,21 @@ float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &o return out; } -void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) +inline void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) { const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1]))); const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3]))); vst1q_u8(output_ptr, vcombine_u8(pa, pb)); } -void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) +inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) { const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); vst1q_u8(output_ptr, vcombine_u8(pa, pb)); } -void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) { int32x4x4_t out = { @@ -88,14 +536,14 @@ void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32 store_quantized(output_ptr, out); } -void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) +inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) { const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); vst1q_s8(output_ptr, vcombine_s8(pa, pb)); } -void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) { int32x4x4_t out = { @@ -122,7 +570,7 @@ inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, cons } template <ArithmeticOperation op> -inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) +float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) { using neon_vector_float = wrapper::traits::neon_vector<float, 4>; float32x4x4_t out = @@ -296,13 +744,13 @@ inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_ return x; } -void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -404,13 +852,13 @@ void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *o } } -void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -511,13 +959,13 @@ void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, I } } -void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -625,6 +1073,7 @@ void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITe &elementwise_arithm_op_quantized_broadcast_loop<op>, &elementwise_arithm_op_quantized_loop<op>); } + template <ArithmeticOperation op> void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { @@ -651,4 +1100,4 @@ void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2 } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ +#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp new file mode 100644 index 0000000000..c5c528d3f3 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window); +} + +template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ArithmeticOperation op> +void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window); +} +template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window); +} +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window); +} +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window); +} +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..fa8e08745a --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op_quantized<op>(in1, in2, out, window); +} + +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_quantized<op>(in1, in2, out, window); +} + +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..abfdf93b75 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window); +} + +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window); +} + +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp new file mode 100644 index 0000000000..d764f56623 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op<op, float16_t>(in1, in2, out, window); +} + +template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<op, float16_t>(in1, in2, out, window); +} + +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp new file mode 100644 index 0000000000..bb33fd2814 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op<op, float32_t>(in1, in2, out, window); +} + +template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<op, float>(in1, in2, out, window); +} +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise/sve/elementwise.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp index 2f9a7998df..b3046e90a8 100644 --- a/src/cpu/kernels/elementwise/sve/elementwise.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +#include "src/core/NEON/SVEMath.h" #include <arm_sve.h> namespace arm_compute @@ -209,6 +208,41 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * &arithmetic_op_loop<ScalarType, ScalarType>, &arithmetic_op_broadcast_loop<ScalarType, ScalarType>); } +template void elementwise_arithmetic_op<ArithmeticOperation::ADD, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SUB, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op<ArithmeticOperation::ADD, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SUB, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op<ArithmeticOperation::ADD, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SUB, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op<ArithmeticOperation::ADD, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SUB, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t> void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -222,6 +256,41 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * &comparison_op_broadcast_loop<InputScalarType, OutputScalarType>); } +template void elementwise_comparison_op<ComparisonOperation::Equal, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + template <> svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b) { @@ -241,71 +310,6 @@ svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svi ARM_COMPUTE_ERROR("Not supported"); } -template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op<ComparisonOperation::Equal, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op<ComparisonOperation::NotEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op<ComparisonOperation::Greater, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op<ComparisonOperation::Less, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op<ComparisonOperation::LessEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ diff --git a/src/cpu/kernels/elementwise/sve/elementwise_list.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h index f762587ce7..b7425c8626 100644 --- a/src/cpu/kernels/elementwise/sve/elementwise_list.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,16 +24,10 @@ #ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H #if defined(ARM_COMPUTE_ENABLE_SVE) + #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/core/NEON/wrapper/svtraits.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" -#include <arm_sve.h> namespace arm_compute { diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp new file mode 100644 index 0000000000..a4f4d0fc82 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op<op, int32_t>(in1, in2, out, window); +} +template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ArithmeticOperation op> +void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op<op, int16_t>(in1, in2, out, window); +} +template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<op, uint8_t>(in1, in2, out, window); +} +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<op, int16_t>(in1, in2, out, window); +} +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<op, int32_t>(in1, in2, out, window); +} +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h index a5d17a86a7..c35ca2d6c3 100644 --- a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,10 +25,7 @@ #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H #if defined(ARM_COMPUTE_ENABLE_SVE2) - -#include "src/core/NEON/wrapper/svtraits.h" -#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" - +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { namespace cpu @@ -66,7 +63,7 @@ struct BroadcastQuantizedLoopArguments const svfloat32_t &out_scale; }; -svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) +inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) { auto x = svld1(pg, ptr); @@ -85,7 +82,7 @@ svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &of svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); } -svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) +inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) { auto x = svld1(pg, ptr); @@ -106,7 +103,7 @@ svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &o svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); } -void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) { const auto quantized = svcreate4( svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), @@ -120,7 +117,7 @@ void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint3 svst1(pg, ptr, narrowed); } -void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) { const auto quantized = svcreate4( svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp new file mode 100644 index 0000000000..63c75c3d4d --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_quantized_op<op, uint8_t>(in1, in2, out, window); +} + +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_quantized_op<op, uint8_t>(in1, in2, out, window); +} + +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE2) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp new file mode 100644 index 0000000000..fe332df386 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_quantized_op<op, int8_t>(in1, in2, out, window); +} + +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op> +void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_quantized_op<op, int8_t>(in1, in2, out, window); +} + +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE2) diff --git a/src/cpu/kernels/elementwise_binary/list.h b/src/cpu/kernels/elementwise_binary/list.h new file mode 100644 index 0000000000..78a098e7bb --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/list.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H +#define SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ELEMETWISE_BINARY_KERNEL(func_name) \ + template <ArithmeticOperation op> \ + void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) + +DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp16_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp32_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve_s32_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve_s16_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_signed_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_signed_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_fp16_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_fp32_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_s16_elementwise_binary); +DECLARE_ELEMETWISE_BINARY_KERNEL(neon_s32_elementwise_binary); + +#undef DECLARE_ELEMETWISE_BINARY_KERNEL + +#define DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(func_name) \ + template <ComparisonOperation op> \ + void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) + +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_u8_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_s16_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_s32_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_fp32_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_fp16_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_signed_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_signed_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_fp16_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_u8_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_s16_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_s32_comparison_elementwise_binary); +DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_fp32_comparison_elementwise_binary); +#undef DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp new file mode 100644 index 0000000000..976d006f11 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_op<__fp16>(in, out, window, op); +} +} +} // namespace arm_compute +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp new file mode 100644 index 0000000000..21f4d9d326 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_op<float>(in, out, window, op); +} +} +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h index 307e95fae9..fd930ae7cf 100644 --- a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h +++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp new file mode 100644 index 0000000000..ef3120e206 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_op<int32_t>(in, out, window, op); +} +} +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp new file mode 100644 index 0000000000..4dd4a1905c --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_sve_op<float16_t>(in, out, window, op); +} +} +} // namespace arm_compute +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp new file mode 100644 index 0000000000..3498a0b1ea --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_sve_op<float32_t>(in, out, window, op); +} +} +} // namespace arm_compute +#endif //ARM_COMPUTE_ENABLE_SVE diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp index ddf1febd66..0c04a56be0 100644 --- a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,15 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(__ARM_FEATURE_SVE) +#if defined(ARM_COMPUTE_ENABLE_SVE) #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include <arm_sve.h> namespace arm_compute { @@ -108,6 +103,7 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); + } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file +#endif //defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h index c2b495f27c..08f4438696 100644 --- a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,10 +23,7 @@ */ #ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H - -#include "arm_compute/core/Types.h" #if defined(ARM_COMPUTE_ENABLE_SVE) - namespace arm_compute { namespace cpu diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp new file mode 100644 index 0000000000..c3e3adfc9e --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + return elementwise_sve_op<int32_t>(in, out, window, op); +} +} +} // namespace arm_compute +#endif //ARM_COMPUTE_ENABLE_SVE diff --git a/src/cpu/kernels/elementwise_unary/list.h b/src/cpu/kernels/elementwise_unary/list.h new file mode 100644 index 0000000000..2a41b74c51 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/list.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H +#define SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H + +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ELEMETWISE_UNARY_KERNEL(func_name) \ + void func_name(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) + +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp16_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_s32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp16_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_s32_elementwise_unary); + +#undef DECLARE_ELEMETWISE_UNARY_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H
\ No newline at end of file |