author    Dana Zlotnik <dana.zlotnik@arm.com>    2021-11-28 14:46:12 +0200
committer Dana Zlotnik <dana.zlotnik@arm.com>    2022-01-19 14:24:13 +0000
commit    d5c496d87e3b446532dd3dd163e9768de0daff4e (patch)
tree      901d9fe6dd369edc7568a558302b87bfd8623616
parent    c48a3e5431ac48fbbd53522e34c99ea4f4ce3e41 (diff)
download  ComputeLibrary-d5c496d87e3b446532dd3dd163e9768de0daff4e.tar.gz
Decouple CpuElementwiseKernel
1- Reorganize the folder structure according to the new definition
2- Separate the unary and binary implementations
3- Decouple the kernels: unary, binary op and binary comparison

Resolves COMPMID-4634

Change-Id: I0195846cc372e74a63c659069a4508de53a22110
Signed-off-by: Dana Zlotnik <dana.zlotnik@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6860
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
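The restructuring boils down to three pieces: a shared templated implementation (impl.h), thin per-data-type wrapper translation units (fp16.cpp, fp32.cpp, integer.cpp, ...), and a selection table in the kernel that picks a wrapper by data type and CPU feature. The sketch below is a minimal, self-contained illustration of that pattern, not the Compute Library API: DataType, ElementwiseOp, KernelEntry and the simplified pointer-based signatures are assumptions made only for the example; the real wrappers keep the ITensor/Window signatures visible in the diffs further down.

// Self-contained sketch of the decoupled layout (illustrative assumptions only,
// not the actual arm_compute API).
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

enum class DataType { F32, S32 };
enum class ElementwiseOp { Max, Min };

// Shared implementation (plays the role of generic/neon/impl.h).
template <ElementwiseOp op, typename T>
void elementwise_binary_impl(const T *a, const T *b, T *dst, int n)
{
    for(int i = 0; i < n; ++i)
    {
        dst[i] = (op == ElementwiseOp::Max) ? std::max(a[i], b[i]) : std::min(a[i], b[i]);
    }
}

// Per-data-type wrappers (play the role of generic/neon/fp32.cpp, integer.cpp, ...).
template <ElementwiseOp op>
void neon_fp32_elementwise_binary(const void *a, const void *b, void *dst, int n)
{
    elementwise_binary_impl<op, float>(static_cast<const float *>(a),
                                       static_cast<const float *>(b),
                                       static_cast<float *>(dst), n);
}

template <ElementwiseOp op>
void neon_s32_elementwise_binary(const void *a, const void *b, void *dst, int n)
{
    elementwise_binary_impl<op, int32_t>(static_cast<const int32_t *>(a),
                                         static_cast<const int32_t *>(b),
                                         static_cast<int32_t *>(dst), n);
}

// Selection table, analogous to configure_arithm_func(): name, predicate, kernel pointer.
struct KernelEntry
{
    std::string                   name;
    std::function<bool(DataType)> is_selected;
    void (*ukernel)(const void *, const void *, void *, int);
};

int main()
{
    const std::vector<KernelEntry> kernels =
    {
        { "neon_fp32_elementwise", [](DataType dt) { return dt == DataType::F32; },
          neon_fp32_elementwise_binary<ElementwiseOp::Max> },
        { "neon_s32_elementwise", [](DataType dt) { return dt == DataType::S32; },
          neon_s32_elementwise_binary<ElementwiseOp::Max> },
    };

    float          a[4] = { 1.f, 5.f, 2.f, 7.f };
    float          b[4] = { 4.f, 3.f, 6.f, 0.f };
    float          out[4];
    const DataType dt   = DataType::F32;

    // Pick the first wrapper whose predicate matches the runtime data type.
    for(const auto &uk : kernels)
    {
        if(uk.is_selected(dt))
        {
            uk.ukernel(a, b, out, 4);
            std::cout << uk.name << ": " << out[0] << " " << out[1] << " " << out[2] << " " << out[3] << "\n";
            break;
        }
    }
    return 0;
}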
-rw-r--r--  Android.bp  20
-rw-r--r--  filelist.json  26
-rw-r--r--  src/cpu/kernels/CpuElementwiseKernel.cpp  96
-rw-r--r--  src/cpu/kernels/CpuElementwiseUnaryKernel.cpp  15
-rw-r--r--  src/cpu/kernels/elementwise/neon/elementwise_list.h  486
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp  61
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp  58
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/impl.h (renamed from src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h)  517
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp  95
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp  59
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp  60
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp  61
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp  60
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp (renamed from src/cpu/kernels/elementwise/sve/elementwise.cpp)  146
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/impl.h (renamed from src/cpu/kernels/elementwise/sve/elementwise_list.h)  10
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp  97
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve2/impl.h (renamed from src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h)  15
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp  61
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp  61
-rw-r--r--  src/cpu/kernels/elementwise_binary/list.h  72
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp  38
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp  36
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/impl.h (renamed from src/cpu/kernels/elementwise/neon/elementwise_unary_list.h)  2
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp  36
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp  38
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp  38
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp (renamed from src/cpu/kernels/elementwise/sve/elementwise_unary.cpp)  12
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/impl.h (renamed from src/cpu/kernels/elementwise/sve/elementwise_unary_list.h)  5
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp  38
-rw-r--r--  src/cpu/kernels/elementwise_unary/list.h  48
30 files changed, 1682 insertions, 685 deletions
diff --git a/Android.bp b/Android.bp
index 0ce9f09290..7e18b3dbb9 100644
--- a/Android.bp
+++ b/Android.bp
@@ -461,8 +461,24 @@ cc_library_static {
"src/cpu/kernels/crop/generic/neon/fp32.cpp",
"src/cpu/kernels/crop/generic/neon/impl.cpp",
"src/cpu/kernels/crop/generic/neon/integer.cpp",
- "src/cpu/kernels/elementwise/sve/elementwise.cpp",
- "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp",
+ "src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp",
"src/cpu/kernels/floor/neon/fp16.cpp",
"src/cpu/kernels/floor/neon/fp32.cpp",
"src/cpu/kernels/genproposals/generic/neon/fp16.cpp",
diff --git a/filelist.json b/filelist.json
index a306845561..a4773ded04 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1259,10 +1259,22 @@
"common": [
"src/cpu/operators/CpuElementwise.cpp",
"src/cpu/kernels/CpuElementwiseKernel.cpp",
- "src/runtime/NEON/functions/NEElementwiseOperations.cpp"
+ "src/runtime/NEON/functions/NEElementwiseOperations.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp"
],
+ "neon":{
+ "fp32": ["src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp"],
+ "integer": ["src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp"]
+ },
"sve": {
- "common": [ "src/cpu/kernels/elementwise/sve/elementwise.cpp" ]
+ "common": ["src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp" ],
+ "integer": ["src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp"],
+ "fp32": ["src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp"],
+ "qasymm8": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp"],
+ "qasymm8_signed": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp"]
}
}
},
@@ -1273,8 +1285,16 @@
"src/cpu/kernels/CpuElementwiseUnaryKernel.cpp",
"src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp"
],
+ "neon": {
+ "integer": ["src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp"],
+ "fp32": ["src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp"]
+ },
"sve": {
- "common": [ "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp" ]
+ "common": ["src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp" ],
+ "integer": ["src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp"],
+ "fp32": ["src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp"],
+ "fp16": ["src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp"]
}
}
},
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index 91de24b850..53179ae95f 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,10 +28,7 @@
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/cpu/kernels/elementwise/neon/elementwise_list.h"
-#include "src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h"
+#include "src/cpu/kernels/elementwise_binary/list.h"
#include <arm_neon.h>
@@ -68,76 +65,73 @@ CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0,
{
"sve_fp32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>))
+ REGISTER_FP32_SVE((arm_compute::cpu::sve_fp32_elementwise_binary<op>))
},
{
"sve_s32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>))
+ REGISTER_INTEGER_SVE((arm_compute::cpu::sve_s32_elementwise_binary<op>))
},
{
"sve_s16_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>))
+ REGISTER_INTEGER_SVE((arm_compute::cpu::sve_s16_elementwise_binary<op>))
+ },
+ {
+ "sve_fp16_elementwise",
+ [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
+ REGISTER_FP16_SVE((arm_compute::cpu::sve_fp16_elementwise_binary<op>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_fp32_elementwise",
+
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>))
+ REGISTER_FP32_NEON((arm_compute::cpu::neon_fp32_elementwise_binary<op>))
},
{
"neon_s32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>))
+ REGISTER_INTEGER_NEON((arm_compute::cpu::neon_s32_elementwise_binary<op>))
+ },
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ {
+ "neon_fp16_elementwise",
+ [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
+ REGISTER_FP16_NEON((arm_compute::cpu::neon_fp16_elementwise_binary<op>))
+ },
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+ {
+ "neon_s16_elementwise",
+ [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON((arm_compute::cpu::neon_s16_elementwise_binary<op>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
"sve2_qu8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>))
+ REGISTER_QASYMM8_SVE2((arm_compute::cpu::sve2_qasymm8_elementwise_binary<op>))
},
{
"sve2_qs8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>))
+ REGISTER_QASYMM8_SIGNED_SVE2((arm_compute::cpu::sve2_qasymm8_signed_elementwise_binary<op>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
{
"neon_qu8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>))
+ REGISTER_QASYMM8_NEON((arm_compute::cpu::neon_qasymm8_elementwise_binary<op>))
},
{
"neon_qs8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>))
+ REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::neon_qasymm8_signed_elementwise_binary<op>))
},
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
-#if defined(ARM_COMPUTE_ENABLE_SVE)
- {
- "sve_fp16_elementwise",
- [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>))
- },
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-#if defined(ARM_COMPUTE_ENABLE_NEON)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_fp16_elementwise",
- [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
- REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>))
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "neon_s16_elementwise",
- [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>))
- },
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
};
for(const auto &uk : kernels)
@@ -161,82 +155,82 @@ CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, c
{
"sve_u8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>))
+ REGISTER_INTEGER_SVE(arm_compute::cpu::sve_u8_comparison_elementwise_binary<op>)
},
{
"sve_fp32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>))
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_comparison_elementwise_binary<op>)
},
{
"sve_s16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>))
+ REGISTER_INTEGER_SVE(arm_compute::cpu::sve_s16_comparison_elementwise_binary<op>)
},
{
"sve_s32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); },
- REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>))
+ REGISTER_INTEGER_SVE(arm_compute::cpu::sve_s32_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_u8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>))
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_comparison_elementwise_binary<op>)
},
{
"neon_fp32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>))
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_comparison_elementwise_binary<op>)
},
{
"neon_s16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>))
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_comparison_elementwise_binary<op>)
},
{
"neon_s32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>))
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
"sve2_qu8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>))
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_comparison_elementwise_binary<op>)
},
{
"sve2_qs8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>))
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
{
"neon_qu8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>))
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_comparison_elementwise_binary<op>)
},
{
"neon_qs8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>))
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_comparison_elementwise_binary<op>)
},
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
"sve_fp16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>))
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
- REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>))
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_comparison_elementwise_binary<op>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
};
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 61bc64b235..f3a82c23f0 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -31,8 +31,7 @@
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/cpu/kernels/elementwise/neon/elementwise_unary_list.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_unary_list.h"
+#include "src/cpu/kernels/elementwise_unary/list.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
@@ -52,7 +51,7 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai
{
return data.dt == DataType::F32 && data.isa.sve;
},
- REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op<float>)
+ REGISTER_FP32_SVE(sve_fp32_elementwise_unary)
},
{
"sve_fp16_elementwise_unary",
@@ -60,31 +59,31 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai
{
return (data.dt == DataType::F16) && data.isa.sve;
},
- REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>),
+ REGISTER_FP16_SVE(sve_fp16_elementwise_unary),
},
{
"sve_s32_elementwise_unary",
[](const DataTypeISASelectorData & data) { return data.dt == DataType::S32 && data.isa.sve; },
- REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op<int32_t>),
+ REGISTER_INTEGER_SVE(sve_s32_elementwise_unary),
},
#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_fp32_elementwise_unary",
[](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<float>),
+ REGISTER_FP32_NEON(neon_fp32_elementwise_unary),
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_fp16_elementwise_unary",
[](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<__fp16>),
+ REGISTER_FP32_NEON(neon_fp16_elementwise_unary),
},
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
"neon_s32_elementwise_unary",
[](const DataTypeISASelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op<int32_t>),
+ REGISTER_INTEGER_NEON(neon_s32_elementwise_unary),
},
#endif // defined(ARM_COMPUTE_ENABLE_NEON)
};
diff --git a/src/cpu/kernels/elementwise/neon/elementwise_list.h b/src/cpu/kernels/elementwise/neon/elementwise_list.h
deleted file mode 100644
index 43e44be5e2..0000000000
--- a/src/cpu/kernels/elementwise/neon/elementwise_list.h
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H
-
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
- int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
- int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const auto a = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
- for(; x < window_end_x; ++x)
- {
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = (*scalar_func)(a, b);
- }
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
-{
- auto res = ScalarType(0);
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = std::max(a, b);
- break;
- case ArithmeticOperation::MIN:
- res = std::min(a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- res = (a - b) * (a - b);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- res = (a > 0 ? a : a * b);
- break;
- }
- case ArithmeticOperation::DIV:
- {
- res = a / b;
- if(std::is_integral<ScalarType>::value)
- {
- res = (b == 0) ? 0 : res;
- if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
- {
- --res;
- }
- }
- break;
- }
- case ArithmeticOperation::POWER:
- {
- res = std::pow(a, b);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ArithmeticOperation op, typename VectorType>
-inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
-{
- using vec_type = typename VectorType::type;
- using scalar_type = typename VectorType::scalar_type;
- using tag_type = typename VectorType::tag_type;
-
- vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = wrapper::vmax(a, b);
- break;
- case ArithmeticOperation::MIN:
- res = wrapper::vmin(a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- const vec_type tmp = wrapper::vsub(a, b);
- res = wrapper::vmul(tmp, tmp);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
- const vec_type tmp = wrapper::vmul(a, b);
- const auto gt = wrapper::vcgt(a, zero);
-
- res = wrapper::vbsl(gt, a, tmp);
- break;
- }
-
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <>
-inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
-{
- return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
-}
-
-template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
-{
- return wrapper::vdiv(a, b);
-}
-
-template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
-{
- return wrapper::vpow(a, b);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
-{
- return wrapper::vdiv(a, b);
-}
-
-template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
-{
- return wrapper::vpow(a, b);
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
-{
- using tag_type = typename VectorType::tag_type;
- using vec_type = typename VectorType::type;
-
- vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
- return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
- }
- return x;
-}
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
- }
- return x;
-}
-
-template <ArithmeticOperation op, typename VectorType>
-void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- using scalar_type = typename VectorType::scalar_type;
-
- elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
- &elementwise_arithm_op_scalar<op, scalar_type>,
- &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
- &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType>
-inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
-{
- bool res = false;
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- res = (a == b);
- break;
- case ComparisonOperation::NotEqual:
- res = (a != b);
- break;
- case ComparisonOperation::Greater:
- res = (a > b);
- break;
- case ComparisonOperation::GreaterEqual:
- res = (a >= b);
- break;
- case ComparisonOperation::Less:
- res = (a < b);
- break;
- case ComparisonOperation::LessEqual:
- res = (a <= b);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
-}
-
-template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
-{
- OutputVectorType res = { 0, 0, 0, 0 };
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- res = wrapper::vceq(a, b);
- break;
- case ComparisonOperation::NotEqual:
- res = wrapper::vnot(wrapper::vceq(a, b));
- break;
- case ComparisonOperation::Greater:
- res = wrapper::vcgt(a, b);
- break;
- case ComparisonOperation::GreaterEqual:
- res = wrapper::vcge(a, b);
- break;
- case ComparisonOperation::Less:
- res = wrapper::vcgt(b, a);
- break;
- case ComparisonOperation::LessEqual:
- res = wrapper::vcge(b, a);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
-{
- InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, a);
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
- const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
- }
- if(x <= window_end_x - 4)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- for(int i = 0; i < 4; i++)
- {
- *(output_ptr + x + i) = wrapper::vgetlane(a, i);
- }
- x = +4;
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
- wrapper::vstore(output_ptr + x, res);
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto a = wrapper::vloadq(input1_ptr + x);
- auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- a = wrapper::vloadq(input1_ptr + x + 4);
- b = wrapper::vloadq(input2_ptr + x + 4);
- const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
- }
- if(x <= window_end_x - 4)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- for(int i = 0; i < 4; i++)
- {
- *(output_ptr + x + i) = wrapper::vgetlane(res, i);
- }
- x = +4;
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
-}
-} // namesapce cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */
\ No newline at end of file
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..6091ef215e
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window);
+}
+
+template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window);
+}
+
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+}
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..2d8fec91c5
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window);
+}
+
+template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window);
+}
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+}
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
index 3b4c112770..ead54ab14e 100644
--- a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,16 +21,464 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
+#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H
+#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H
-#include "src/cpu/kernels/elementwise/neon/elementwise_list.h"
+#include "src/core/NEON/NEAsymm.h"
namespace arm_compute
{
namespace cpu
{
-float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
+template <ArithmeticOperation op, typename VectorType>
+typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
+{
+ using vec_type = typename VectorType::type;
+ using scalar_type = typename VectorType::scalar_type;
+ using tag_type = typename VectorType::tag_type;
+
+ vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
+
+ switch(op)
+ {
+ case ArithmeticOperation::MAX:
+ res = wrapper::vmax(a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = wrapper::vmin(a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ const vec_type tmp = wrapper::vsub(a, b);
+ res = wrapper::vmul(tmp, tmp);
+ break;
+ }
+ case ArithmeticOperation::PRELU:
+ {
+ const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
+ const vec_type tmp = wrapper::vmul(a, b);
+ const auto gt = wrapper::vcgt(a, zero);
+
+ res = wrapper::vbsl(gt, a, tmp);
+ break;
+ }
+
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
+{
+ using tag_type = typename VectorType::tag_type;
+ using vec_type = typename VectorType::type;
+
+ vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
+ return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+ int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
+ int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
+{
+ // Create input windows
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+ if(is_broadcast_across_x)
+ {
+ const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+ Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+ Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+ // Clear X Dimension on execution window as we handle manually
+ non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator broadcast_input(broadcast_tensor, broadcast_win);
+ Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
+ for(; x < window_end_x; ++x)
+ {
+ const auto a = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
+ }
+ else
+ {
+ // Clear X Dimension on execution window as we handle manually
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
+ for(; x < window_end_x; ++x)
+ {
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(a, b);
+ }
+ },
+ input1, input2, output);
+ }
+}
+
+template <ArithmeticOperation op, typename ScalarType>
+inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+ auto res = ScalarType(0);
+
+ switch(op)
+ {
+ case ArithmeticOperation::MAX:
+ res = std::max(a, b);
+ break;
+ case ArithmeticOperation::MIN:
+ res = std::min(a, b);
+ break;
+ case ArithmeticOperation::SQUARED_DIFF:
+ {
+ res = (a - b) * (a - b);
+ break;
+ }
+ case ArithmeticOperation::PRELU:
+ {
+ res = (a > 0 ? a : a * b);
+ break;
+ }
+ case ArithmeticOperation::DIV:
+ {
+ res = a / b;
+ if(std::is_integral<ScalarType>::value)
+ {
+ res = (b == 0) ? 0 : res;
+ if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
+ {
+ --res;
+ }
+ }
+ break;
+ }
+ case ArithmeticOperation::POWER:
+ {
+ res = std::pow(a, b);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res;
+}
+
+template <>
+inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
+{
+ return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
+}
+
+template <>
+inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+{
+ return wrapper::vdiv(a, b);
+}
+
+template <>
+inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+{
+ return wrapper::vpow(a, b);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+{
+ return wrapper::vdiv(a, b);
+}
+
+template <>
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+{
+ return wrapper::vpow(a, b);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
+ }
+ return x;
+}
+
+template <ArithmeticOperation op, typename ScalarType, typename VectorType>
+inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
+ const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
+ wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
+ }
+ return x;
+}
+
+template <ArithmeticOperation op, typename VectorType>
+void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ using scalar_type = typename VectorType::scalar_type;
+
+ elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
+ &elementwise_arithm_op_scalar<op, scalar_type>,
+ &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
+ &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
+}
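
elementwise_arithm_op is the piece the per-type translation units below instantiate; a typical call site binds one operation and one NEON vector trait, for example (usage sketch only):

// elementwise_arithm_op<ArithmeticOperation::MAX, wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window);
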
+
+template <ComparisonOperation op, typename InputScalarType>
+inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
+{
+ bool res = false;
+
+ switch(op)
+ {
+ case ComparisonOperation::Equal:
+ res = (a == b);
+ break;
+ case ComparisonOperation::NotEqual:
+ res = (a != b);
+ break;
+ case ComparisonOperation::Greater:
+ res = (a > b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ res = (a >= b);
+ break;
+ case ComparisonOperation::Less:
+ res = (a < b);
+ break;
+ case ComparisonOperation::LessEqual:
+ res = (a <= b);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+ return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
+}
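
Comparison results are written as full byte masks, 255 for true and 0 for false, so the scalar tail produces the same bit pattern as the narrowed vector masks. For example (illustrative usage):

// elementwise_comp_op_scalar<ComparisonOperation::Greater, float>(2.0f, 1.0f) == 255
// elementwise_comp_op_scalar<ComparisonOperation::Greater, float>(1.0f, 2.0f) == 0
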
+
+template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
+{
+ OutputVectorType res = { 0, 0, 0, 0 };
+
+ switch(op)
+ {
+ case ComparisonOperation::Equal:
+ res = wrapper::vceq(a, b);
+ break;
+ case ComparisonOperation::NotEqual:
+ res = wrapper::vnot(wrapper::vceq(a, b));
+ break;
+ case ComparisonOperation::Greater:
+ res = wrapper::vcgt(a, b);
+ break;
+ case ComparisonOperation::GreaterEqual:
+ res = wrapper::vcge(a, b);
+ break;
+ case ComparisonOperation::Less:
+ res = wrapper::vcgt(b, a);
+ break;
+ case ComparisonOperation::LessEqual:
+ res = wrapper::vcge(b, a);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return res;
+}
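
Less and LessEqual reuse the Greater/GreaterEqual intrinsics with the operands swapped, so no dedicated less-than call is needed:

// (a < b) == (b > a)  -> wrapper::vcgt(b, a);    (a <= b) == (b >= a) -> wrapper::vcge(b, a)
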
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
+inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
+{
+ InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+ return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, a);
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
+ const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
+ }
+ if(x <= window_end_x - 4)
+ {
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ for(int i = 0; i < 4; i++)
+ {
+ *(output_ptr + x + i) = wrapper::vgetlane(a, i);
+ }
+        x += 4;
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
+ wrapper::vstore(output_ptr + x, res);
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
+ }
+ return x;
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
+ const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+{
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto a = wrapper::vloadq(input1_ptr + x);
+ auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ a = wrapper::vloadq(input1_ptr + x + 4);
+ b = wrapper::vloadq(input2_ptr + x + 4);
+ const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
+ }
+ if(x <= window_end_x - 4)
+ {
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
+ for(int i = 0; i < 4; i++)
+ {
+ *(output_ptr + x + i) = wrapper::vgetlane(res, i);
+ }
+        x += 4;
+ }
+ return x;
+}
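
Each 32-bit iteration evaluates two uint32x4_t masks and funnels them into eight output bytes: narrow each mask to uint16x4_t, combine into a uint16x8_t, then narrow again to uint8x8_t. A standalone sketch of that narrowing chain with raw NEON intrinsics (illustrative, not the wrapper API):

#include <arm_neon.h>

// Collapse two 32-bit comparison masks into eight bytes of 0x00/0xFF, the same
// chain as wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))) above.
uint8x8_t narrow_comparison_masks(uint32x4_t m0, uint32x4_t m1)
{
    const uint16x4_t lo = vmovn_u32(m0);
    const uint16x4_t hi = vmovn_u32(m1);
    return vmovn_u16(vcombine_u16(lo, hi));
}
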
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+ &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+ &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
+void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
+ &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
+}
+
+inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
qasymm8x16_t x = vld1q_u8(input1_ptr);
const float32x4x4_t out =
@@ -45,7 +493,7 @@ float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset,
return out;
}
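
load_quantized expands sixteen QASYMM8 values into four float32x4_t lanes using the usual asymmetric dequantisation rule fp = scale * (q - offset); the body elided by the hunk above applies it lane by lane. A scalar sketch of the same rule (illustrative only, not the library code):

#include <cstdint>

// Illustrative scalar form of the per-lane dequantisation used above.
inline float dequantize_qasymm8(uint8_t q, int32_t offset, float scale)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}
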
-float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
+inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
const float32x4x4_t out =
@@ -60,21 +508,21 @@ float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &o
return out;
}
-void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
+inline void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
{
const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));
const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));
vst1q_u8(output_ptr, vcombine_u8(pa, pb));
}
-void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
+inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
{
const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
vst1q_u8(output_ptr, vcombine_u8(pa, pb));
}
-void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
{
int32x4x4_t out =
{
@@ -88,14 +536,14 @@ void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32
store_quantized(output_ptr, out);
}
-void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
+inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
{
const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
vst1q_s8(output_ptr, vcombine_s8(pa, pb));
}
-void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
{
int32x4x4_t out =
{
@@ -122,7 +570,7 @@ inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, cons
}
template <ArithmeticOperation op>
-inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
+float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
{
using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
float32x4x4_t out =
@@ -296,13 +744,13 @@ inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_
return x;
}
-void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
+ float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
+ int32x4_t, int32x4_t, float32x4_t, float32x4_t,
+ float32x4_t, float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -404,13 +852,13 @@ void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *o
}
}
-void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
+ float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
+ int32x4_t, int32x4_t, float32x4_t, float32x4_t,
+ float32x4_t, float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -511,13 +959,13 @@ void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, I
}
}
-void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+ int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
+ float32x4_t, float32x4_t, const bool),
+ int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
+ int32x4_t, int32x4_t, float32x4_t, float32x4_t,
+ float32x4_t, float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -625,6 +1073,7 @@ void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITe
&elementwise_arithm_op_quantized_broadcast_loop<op>,
&elementwise_arithm_op_quantized_loop<op>);
}
+
template <ArithmeticOperation op>
void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
@@ -651,4 +1100,4 @@ void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
+#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H */
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
new file mode 100644
index 0000000000..c5c528d3f3
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window);
+}
+
+template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ArithmeticOperation op>
+void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window);
+}
+template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window);
+}
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window);
+}
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window);
+}
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
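
The new per-type files instantiate the header templates explicitly, so a data type or operation can be compiled in (or stripped out) without touching the shared implementation, and callers can take the address of a concrete symbol. A hypothetical dispatch sketch built on the instantiations above (not the actual CpuElementwiseKernel code; the selector name is made up):

// Hypothetical selector mapping an ArithmeticOperation to the instantiated S32 kernel.
using BinaryKernelFn = void (*)(const arm_compute::ITensor *, const arm_compute::ITensor *, arm_compute::ITensor *, const arm_compute::Window &);

BinaryKernelFn select_s32_arithmetic_kernel(arm_compute::ArithmeticOperation op)
{
    using arm_compute::ArithmeticOperation;
    switch(op)
    {
        case ArithmeticOperation::MAX:
            return &arm_compute::cpu::neon_s32_elementwise_binary<ArithmeticOperation::MAX>;
        case ArithmeticOperation::MIN:
            return &arm_compute::cpu::neon_s32_elementwise_binary<ArithmeticOperation::MIN>;
        default:
            return nullptr; // remaining operations omitted for brevity
    }
}
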
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..fa8e08745a
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op_quantized<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_quantized<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..abfdf93b75
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window);
+}
+
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
new file mode 100644
index 0000000000..d764f56623
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<op, float16_t>(in1, in2, out, window);
+}
+
+template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, float16_t>(in1, in2, out, window);
+}
+
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
new file mode 100644
index 0000000000..bb33fd2814
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <ArithmeticOperation op>
+void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<op, float32_t>(in1, in2, out, window);
+}
+
+template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, float>(in1, in2, out, window);
+}
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise/sve/elementwise.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
index 2f9a7998df..b3046e90a8 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_list.h"
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+#include "src/core/NEON/SVEMath.h"
#include <arm_sve.h>
namespace arm_compute
@@ -209,6 +208,41 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *
&arithmetic_op_loop<ScalarType, ScalarType>,
&arithmetic_op_broadcast_loop<ScalarType, ScalarType>);
}
+template void elementwise_arithmetic_op<ArithmeticOperation::ADD, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SUB, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::ADD, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SUB, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::ADD, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SUB, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::ADD, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SUB, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -222,6 +256,41 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *
&comparison_op_broadcast_loop<InputScalarType, OutputScalarType>);
}
+template void elementwise_comparison_op<ComparisonOperation::Equal, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
template <>
svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
{
@@ -241,71 +310,6 @@ svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svi
ARM_COMPUTE_ERROR("Not supported");
}
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Equal, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Greater, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Less, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
diff --git a/src/cpu/kernels/elementwise/sve/elementwise_list.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
index f762587ce7..b7425c8626 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise_list.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,16 +24,10 @@
#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
#if defined(ARM_COMPUTE_ENABLE_SVE)
+
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/core/NEON/wrapper/svtraits.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include <arm_sve.h>
namespace arm_compute
{
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
new file mode 100644
index 0000000000..a4f4d0fc82
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
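+// Thin per-type wrappers: each binds the generic SVE implementation from
+// impl.h to a concrete element type so that callers reference one symbol per
+// data type and operation.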
+template <ArithmeticOperation op>
+void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<op, int32_t>(in1, in2, out, window);
+}
+template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ArithmeticOperation op>
+void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_op<op, int16_t>(in1, in2, out, window);
+}
+template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, uint8_t>(in1, in2, out, window);
+}
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, int16_t>(in1, in2, out, window);
+}
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_op<op, int32_t>(in1, in2, out, window);
+}
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
index a5d17a86a7..c35ca2d6c3 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,7 @@
#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
#if defined(ARM_COMPUTE_ENABLE_SVE2)
-
-#include "src/core/NEON/wrapper/svtraits.h"
-#include "src/cpu/kernels/elementwise/sve/elementwise_list.h"
-
+#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
@@ -66,7 +63,7 @@ struct BroadcastQuantizedLoopArguments
const svfloat32_t &out_scale;
};
-svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
+inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
{
auto x = svld1(pg, ptr);
@@ -85,7 +82,7 @@ svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &of
svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
}
-svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
+inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
{
auto x = svld1(pg, ptr);
@@ -106,7 +103,7 @@ svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &o
svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
}
-void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
{
const auto quantized = svcreate4(
svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
@@ -120,7 +117,7 @@ void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint3
svst1(pg, ptr, narrowed);
}
-void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
{
const auto quantized = svcreate4(
svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
new file mode 100644
index 0000000000..63c75c3d4d
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
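+// QASYMM8 wrappers: forward to the quantized SVE2 helpers in impl.h, which
+// dequantize to float, apply the operation and requantize the result.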
+template <ArithmeticOperation op>
+void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_quantized_op<op, uint8_t>(in1, in2, out, window);
+}
+
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_quantized_op<op, uint8_t>(in1, in2, out, window);
+}
+
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE2)
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
new file mode 100644
index 0000000000..fe332df386
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
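+// QASYMM8_SIGNED wrappers: same dequantize/operate/requantize path as the
+// unsigned variant, instantiated for int8_t.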
+template <ArithmeticOperation op>
+void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_arithmetic_quantized_op<op, int8_t>(in1, in2, out, window);
+}
+
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template <ComparisonOperation op>
+void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ return elementwise_comparison_quantized_op<op, int8_t>(in1, in2, out, window);
+}
+
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE2)
diff --git a/src/cpu/kernels/elementwise_binary/list.h b/src/cpu/kernels/elementwise_binary/list.h
new file mode 100644
index 0000000000..78a098e7bb
--- /dev/null
+++ b/src/cpu/kernels/elementwise_binary/list.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H
+#define SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
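+// Forward declarations of the per-type binary kernels; the implementations
+// live in the generic/neon, generic/sve and generic/sve2 sources, so this
+// header can be included without pulling in any intrinsics.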
+#define DECLARE_ELEMETWISE_BINARY_KERNEL(func_name) \
+ template <ArithmeticOperation op> \
+ void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp16_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve_fp32_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve_s32_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve_s16_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_signed_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_signed_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_fp16_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_fp32_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_s16_elementwise_binary);
+DECLARE_ELEMETWISE_BINARY_KERNEL(neon_s32_elementwise_binary);
+
+#undef DECLARE_ELEMETWISE_BINARY_KERNEL
+
+#define DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(func_name) \
+ template <ComparisonOperation op> \
+ void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_u8_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_s16_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_s32_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_fp32_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve_fp16_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_signed_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(sve2_qasymm8_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_qasymm8_signed_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_fp16_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_u8_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_s16_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_s32_comparison_elementwise_binary);
+DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL(neon_fp32_comparison_elementwise_binary);
+#undef DECLARE_COPMP_ELEMETWISE_BINARY_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_ELEMETWISE_BINARY_LIST_H
\ No newline at end of file
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..976d006f11
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
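+// Only compiled when the target provides FP16 vector arithmetic and FP16
+// kernels are enabled (see the guard above).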
+void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_op<__fp16>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..21f4d9d326
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_op<float>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
index 307e95fae9..fd930ae7cf 100644
--- a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
new file mode 100644
index 0000000000..ef3120e206
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_op<int32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
new file mode 100644
index 0000000000..4dd4a1905c
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_sve_op<float16_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
new file mode 100644
index 0000000000..3498a0b1ea
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_sve_op<float32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
index ddf1febd66..0c04a56be0 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,15 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
namespace arm_compute
{
@@ -108,6 +103,7 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E
template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h
index c2b495f27c..08f4438696 100644
--- a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,10 +23,7 @@
*/
#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
-
-#include "arm_compute/core/Types.h"
#if defined(ARM_COMPUTE_ENABLE_SVE)
-
namespace arm_compute
{
namespace cpu
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
new file mode 100644
index 0000000000..c3e3adfc9e
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ return elementwise_sve_op<int32_t>(in, out, window, op);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif //defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/cpu/kernels/elementwise_unary/list.h b/src/cpu/kernels/elementwise_unary/list.h
new file mode 100644
index 0000000000..2a41b74c51
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/list.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H
+#define SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H
+
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
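+// One declaration per data-type-specific unary kernel; the macro keeps the
+// shared signature in a single place.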
+#define DECLARE_ELEMETWISE_UNARY_KERNEL(func_name) \
+ void func_name(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+
+DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp32_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp16_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(sve_s32_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp32_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp16_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_s32_elementwise_unary);
+
+#undef DECLARE_ELEMETWISE_UNARY_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H
\ No newline at end of file