From da816752cad76c8e1b367e8e9c648994a1af599a Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 2 Jul 2021 09:22:14 +0100
Subject: Remove redundant implementations of Add/Sub operators

Allows only implementations where inputs/output are of the same
data type and removes legacy Computer Vision ones.

Signed-off-by: Georgios Pinitas
Change-Id: Ia2b3d23a04236aab682f0c36a1110a30f7c06d1c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5900
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
---
 src/core/cpu/kernels/CpuAddKernel.cpp     | 141 ++++-----------------
 src/core/cpu/kernels/CpuAddKernel.h       |   3 -
 src/core/cpu/kernels/CpuSubKernel.cpp     |  87 +++----------
 src/core/cpu/kernels/CpuSubKernel.h       |   3 -
 src/core/cpu/kernels/add/neon/integer.cpp | 170 ------------------------
 src/core/cpu/kernels/add/neon/list.h      |   3 -
 src/core/cpu/kernels/add/sve/integer.cpp  | 201 -----------------------------
 src/core/cpu/kernels/add/sve/list.h       |   3 -
 src/core/cpu/kernels/sub/neon/integer.cpp | 183 --------------------------
 src/core/cpu/kernels/sub/neon/list.h      |   3 -
 10 files changed, 44 insertions(+), 753 deletions(-)
 delete mode 100644 src/core/cpu/kernels/add/neon/integer.cpp
 delete mode 100644 src/core/cpu/kernels/add/sve/integer.cpp
 delete mode 100644 src/core/cpu/kernels/sub/neon/integer.cpp
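Before the per-file diffs, a minimal self-contained sketch of the dispatch pattern being simplified may help. Each entry in the kernel table pairs a name, a selector predicate, and a micro-kernel function pointer; get_implementation() returns the first entry whose predicate accepts the query. With inputs and output forced to one common data type, the selector data carries a single DataType instead of three. The enum, toy kernels and main() below are illustrative scaffolding only; just the overall shape mirrors available_kernels/get_implementation in CpuAddKernel.cpp.

#include <cstdio>

enum class DataType
{
    U8,
    S16,
    F32
};

struct AddSelectorData
{
    DataType dt; // single common type for src0, src1 and dst
};

struct AddKernel
{
    const char *name;
    bool (*is_selected)(const AddSelectorData &);
    void (*ukernel)();
};

static void add_same_f32() { std::puts("f32 add kernel"); }
static void add_same_s16() { std::puts("s16 add kernel"); }

static const AddKernel available_kernels[] =
{
    { "neon_fp32_add", [](const AddSelectorData &d) { return d.dt == DataType::F32; }, add_same_f32 },
    { "neon_s16_add", [](const AddSelectorData &d) { return d.dt == DataType::S16; }, add_same_s16 },
};

static const AddKernel *get_implementation(DataType dt)
{
    // First matching predicate wins, as in the library's selector tables
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected({ dt }))
        {
            return &uk;
        }
    }
    return nullptr;
}

int main()
{
    if(const AddKernel *uk = get_implementation(DataType::F32))
    {
        uk->ukernel(); // prints "f32 add kernel"
    }
}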
diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp
index 12766037a7..61b7b19443 100644
--- a/src/core/cpu/kernels/CpuAddKernel.cpp
+++ b/src/core/cpu/kernels/CpuAddKernel.cpp
@@ -45,14 +45,7 @@ namespace
 {
 struct AddSelectorData
 {
-    /* Data types for all ITensorInfos:
-       dt1 -> src0
-       dt2 -> src1
-       dt3 -> dst
-    */
-    DataType dt1;
-    DataType dt2;
-    DataType dt3;
+    DataType       dt;
     const CPUInfo &ci;
 };
 
@@ -72,7 +65,7 @@ static const AddKernel available_kernels[] =
         "sve2_qu8_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)) && data.ci.has_sve();
+            return (data.dt == DataType::QASYMM8) && data.ci.has_sve();
         },
         REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve)
     },
@@ -80,7 +73,7 @@ static const AddKernel available_kernels[] =
         "sve2_qs8_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)) && data.ci.has_sve();
+            return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve();
         },
         REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve)
     },
@@ -88,7 +81,7 @@ static const AddKernel available_kernels[] =
         "sve2_qs16_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)) && data.ci.has_sve();
+            return (data.dt == DataType::QSYMM16) && data.ci.has_sve();
         },
         REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve)
     },
@@ -98,7 +91,7 @@ static const AddKernel available_kernels[] =
         "sve_fp32_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)) && data.ci.has_sve();
+            return (data.dt == DataType::F32) && data.ci.has_sve();
         },
         REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve<float>)
     },
@@ -106,7 +99,7 @@ static const AddKernel available_kernels[] =
         "sve_fp16_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)) && data.ci.has_sve();
+            return (data.dt == DataType::F16) && data.ci.has_sve();
         },
         REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve<float16_t>)
     },
@@ -114,7 +107,7 @@ static const AddKernel available_kernels[] =
         "sve_u8_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)) && data.ci.has_sve();
+            return (data.dt == DataType::U8) && data.ci.has_sve();
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<uint8_t>)
     },
@@ -122,7 +115,7 @@ static const AddKernel available_kernels[] =
         "sve_s16_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)) && data.ci.has_sve();
+            return (data.dt == DataType::S16) && data.ci.has_sve();
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int16_t>)
     },
@@ -130,39 +123,15 @@ static const AddKernel available_kernels[] =
         "sve_s32_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)) && data.ci.has_sve();
+            return (data.dt == DataType::S32) && data.ci.has_sve();
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int32_t>)
     },
-    {
-        "sve_u8_s16_s16_add",
-        [](const AddSelectorData & data)
-        {
-            return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)) && data.ci.has_sve();
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_s16_s16_sve)
-    },
-    {
-        "sve_s16_u8_s16_add",
-        [](const AddSelectorData & data)
-        {
-            return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)) && data.ci.has_sve();
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_u8_s16_sve)
-    },
-    {
-        "sve_u8_u8_s16_add",
-        [](const AddSelectorData & data)
-        {
-            return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)) && data.ci.has_sve();
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve)
-    },
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
 #if defined(ARM_COMPUTE_ENABLE_NEON)
     {
         "neon_fp32_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::F32); },
         REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon<float>)
     },
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -170,56 +139,41 @@ static const AddKernel available_kernels[] =
         "neon_fp16_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)) && data.ci.has_fp16();
+            return (data.dt == DataType::F16) && data.ci.has_fp16();
         },
         REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon<float16_t>)
     },
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
     {
         "neon_u8_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::U8); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<uint8_t>)
     },
     {
         "neon_s16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::S16); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int16_t>)
     },
     {
         "neon_s32_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::S32); },
        REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int32_t>)
     },
-    {
-        "neon_u8_s16_s16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_s16_s16_neon)
-    },
-    {
-        "neon_s16_u8_s16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_u8_s16_neon)
-    },
-    {
-        "neon_u8_u8_s16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon)
-    },
 #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
 #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
     {
         "neon_qu8_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8); },
         REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
     },
     {
         "neon_qs8_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
     },
     {
         "neon_qs16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::QSYMM16); },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
@@ -231,11 +185,11 @@ static const AddKernel available_kernels[] =
  *
  * @return A matching micro-kernel else nullptr
  */
-const AddKernel *get_implementation(const CPUInfo &cpuinfo, DataType dt1, DataType dt2, DataType dt3)
+const AddKernel *get_implementation(const CPUInfo &cpuinfo, DataType dt)
 {
     for(const auto &uk : available_kernels)
     {
-        if(uk.is_selected({ dt1, dt2, dt3, cpuinfo }))
+        if(uk.is_selected({ dt, cpuinfo }))
         {
             return &uk;
         }
@@ -251,9 +205,7 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                          DataType::S16, DataType::QSYMM16, DataType::F16,
                                                          DataType::S32, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                         DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
 
@@ -265,25 +217,12 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
     // Validate in case of configured dst
     if(dst.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-            !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
-            && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
-            && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16)
-            && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
-            && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
-            && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16),
-            "You called addition with the wrong image formats");
-
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
                                         "Wrong shape for dst");
     }
 
-    const auto *uk = get_implementation(CPUInfo::get(), src0.data_type(), src1.data_type(), dst.data_type());
+    const auto *uk = get_implementation(CPUInfo::get(), src0.data_type());
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     return Status{};
@@ -294,38 +233,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &src0,
     const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
 
     // Auto initialize dst if not initialized
-    {
-        set_shape_if_empty(dst, out_shape);
-
-        if(src0.data_type() == DataType::S16 || src1.data_type() == DataType::S16)
-        {
-            set_format_if_unknown(dst, Format::S16);
-        }
-        if(src0.data_type() == DataType::S32 || src1.data_type() == DataType::S32)
-        {
-            set_format_if_unknown(dst, Format::S32);
-        }
-        else if(src0.data_type() == DataType::F16 || src1.data_type() == DataType::F16)
-        {
-            set_format_if_unknown(dst, Format::F16);
-        }
-        else if(src0.data_type() == DataType::F32 || src1.data_type() == DataType::F32)
-        {
-            set_format_if_unknown(dst, Format::F32);
-        }
-        else if(src0.data_type() == DataType::QASYMM8 || src1.data_type() == DataType::QASYMM8)
-        {
-            set_data_type_if_unknown(dst, DataType::QASYMM8);
-        }
-        else if(src0.data_type() == DataType::QASYMM8_SIGNED || src1.data_type() == DataType::QASYMM8_SIGNED)
-        {
-            set_data_type_if_unknown(dst, DataType::QASYMM8_SIGNED);
-        }
-        else if(src0.data_type() == DataType::QSYMM16 || src1.data_type() == DataType::QSYMM16)
-        {
-            set_data_type_if_unknown(dst, DataType::QSYMM16);
-        }
-    }
+    set_shape_if_empty(dst, out_shape);
+    set_data_type_if_unknown(dst, src0.data_type());
 
     Window win = calculate_max_window(out_shape, Steps());
 
@@ -339,7 +248,7 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
 
-    const auto uk = get_implementation(CPUInfo::get(), src0->data_type(), src1->data_type(), dst->data_type());
+    const auto uk = get_implementation(CPUInfo::get(), src0->data_type());
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
     _policy = policy;
diff --git a/src/core/cpu/kernels/CpuAddKernel.h b/src/core/cpu/kernels/CpuAddKernel.h
index 717d0132c6..1205b45dfb 100644
--- a/src/core/cpu/kernels/CpuAddKernel.h
+++ b/src/core/cpu/kernels/CpuAddKernel.h
@@ -44,9 +44,6 @@ public:
      * Valid configurations (src0,src1) -> dst :
      *
      * - (U8,U8) -> U8
-     * - (U8,U8) -> S16
-     * - (S16,U8) -> S16
-     * - (U8,S16) -> S16
      * - (S16,S16) -> S16
      * - (S32,S32) -> S32
      * - (F16,F16) -> F16
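The header change above narrows the public contract: the widening configurations such as (U8,S16) -> S16 are gone and only same-type configurations validate. A hedged sketch of how a caller could probe this through the kernel's static validate(); the include paths, the TensorInfo constructor and the ErrorCode comparison are assumptions based on this release's layout, not something the patch itself defines.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/cpu/kernels/CpuAddKernel.h"

using namespace arm_compute;

static bool can_add(DataType a, DataType b, DataType out)
{
    const TensorInfo src0(TensorShape(16U, 16U), 1, a);
    const TensorInfo src1(TensorShape(16U, 16U), 1, b);
    const TensorInfo dst(TensorShape(16U, 16U), 1, out);
    // validate() returns an arm_compute::Status; ErrorCode::OK means the configuration is accepted
    return cpu::kernels::CpuAddKernel::validate(&src0, &src1, &dst, ConvertPolicy::SATURATE).error_code() == ErrorCode::OK;
}

// can_add(DataType::S16, DataType::S16, DataType::S16) -> true (still supported)
// can_add(DataType::U8,  DataType::S16, DataType::S16) -> false after this patch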
diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp
index 098a324377..fa7a55805e 100644
--- a/src/core/cpu/kernels/CpuSubKernel.cpp
+++ b/src/core/cpu/kernels/CpuSubKernel.cpp
@@ -41,9 +41,7 @@ namespace
 {
 struct SubSelectorData
 {
-    DataType dt1;
-    DataType dt2;
-    DataType dt3;
+    DataType dt;
 };
 
 using SubSelectorPtr = std::add_pointer<bool(const SubSelectorData &)>::type;
@@ -60,59 +58,44 @@ static const SubKernel available_kernels[] =
 {
     {
         "neon_fp32_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::F32); },
         REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
     },
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
         "neon_fp16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::F16); },
         REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
     },
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
     {
         "neon_u8_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::U8); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
     },
     {
         "neon_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::S16); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
     },
     {
         "neon_s32_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::S32); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
     },
-    {
-        "neon_u8_s16_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_s16_s16_neon)
-    },
-    {
-        "neon_s16_u8_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_s16_u8_s16_neon)
-    },
-    {
-        "neon_u8_u8_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_u8_s16_neon)
-    },
     {
         "neon_qu8_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8); },
         REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
     },
     {
         "neon_qs8_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
     },
     {
-        "neon_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
+        "neon_qs16_sub",
+        [](const SubSelectorData & data) { return (data.dt == DataType::QSYMM16); },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
     },
 };
@@ -123,11 +106,11 @@ static const SubKernel available_kernels[] =
  *
  * @return A matching micro-kernel else nullptr
  */
-const SubKernel *get_implementation(DataType dt1, DataType dt2, DataType dt3)
+const SubKernel *get_implementation(DataType dt)
 {
     for(const auto &uk : available_kernels)
     {
-        if(uk.is_selected({ dt1, dt2, dt3 }))
+        if(uk.is_selected({ dt }))
         {
             return &uk;
         }
@@ -141,54 +124,21 @@ inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
                                                          DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
-                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
 
-    const auto *uk = get_implementation(src0.data_type(), src1.data_type(), dst.data_type());
+    const auto *uk = get_implementation(src0.data_type());
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8)
-        && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8)
-        && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED)
-        && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16)
-        && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8)
-        && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16)
-        && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8)
-        && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16)
-        && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32)
-        && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32)
-        && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16),
-        "You called subtract with the wrong image formats");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        (src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP)
-        || (src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP)
-        || (src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP),
-        "Convert policy cannot be WRAP if datatype is quantized");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src0.data_type()) && (policy == ConvertPolicy::WRAP),
+                                    "Convert policy cannot be WRAP if datatype is quantized");
 
     // Validate in case of configured dst
     if(dst.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-            !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
-            && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
-            && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
-            && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
-            && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
-            && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16),
-            "You called subtract with the wrong image formats");
-
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
                                         "Wrong shape for dst");
     }
@@ -205,8 +155,9 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
 
     // Auto initialize dst if not initialized
     set_shape_if_empty(*dst, out_shape);
+    set_data_type_if_unknown(*dst, src0->data_type());
 
-    const auto *uk = get_implementation(src0->data_type(), src1->data_type(), dst->data_type());
+    const auto *uk = get_implementation(src0->data_type());
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
     _policy = policy;
diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h
index b9160bd150..cb64e64cfa 100644
--- a/src/core/cpu/kernels/CpuSubKernel.h
+++ b/src/core/cpu/kernels/CpuSubKernel.h
@@ -45,11 +45,8 @@ public:
      * Valid configurations (src0,src1) -> dst :
      *
      * - (U8,U8) -> U8
-     * - (U8,U8) -> S16
      * - (QASYMM8, QASYMM8) -> QASYMM8
     * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     * - (S16,U8) -> S16
-     * - (U8,S16) -> S16
      * - (S16,S16) -> S16
      * - (S32,S32) -> S32
      * - (F16,F16) -> F16
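Both Add and Sub branch on ConvertPolicy, and the consolidated check above forbids WRAP for quantized types. For reference, a standalone scalar illustration of the two policies on S16 (plain C++, not library code): WRAP reduces the result modulo 2^16, while SATURATE clamps it to the representable range.

#include <cstdint>
#include <limits>

static int16_t sub_wrap_s16(int16_t a, int16_t b)
{
    // Unsigned arithmetic wraps modulo 2^16 by definition
    return static_cast<int16_t>(static_cast<uint16_t>(a) - static_cast<uint16_t>(b));
}

static int16_t sub_sat_s16(int16_t a, int16_t b)
{
    // Compute in a wider type, then clamp to the S16 range
    const int32_t r = static_cast<int32_t>(a) - static_cast<int32_t>(b);
    if(r > std::numeric_limits<int16_t>::max()) { return std::numeric_limits<int16_t>::max(); }
    if(r < std::numeric_limits<int16_t>::min()) { return std::numeric_limits<int16_t>::min(); }
    return static_cast<int16_t>(r);
}

// sub_wrap_s16(-32768, 1) == 32767  (wraps around)
// sub_sat_s16(-32768, 1)  == -32768 (clamped)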
diff --git a/src/core/cpu/kernels/add/neon/integer.cpp b/src/core/cpu/kernels/add/neon/integer.cpp
deleted file mode 100644
index 24a0ac3b7c..0000000000
--- a/src/core/cpu/kernels/add/neon/integer.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
-            }
-        }
-        else
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
-                                                     static_cast<int16_t>(*(input2_ptr + x)));
-            }
-        }
-    },
-    input1, input2, output);
-}
-
-void add_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = wrapper::vloadq(input1_ptr + x);
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
-            }
-        }
-        else
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = wrapper::vloadq(input1_ptr + x);
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
-            }
-        }
-    },
-    input1, input2, output);
-}
-
-void add_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Simply swap the two input buffers:
-    add_s16_u8_s16_neon(src1, src0, dst, policy, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/neon/list.h b/src/core/cpu/kernels/add/neon/list.h
index 3ab03dd40e..379bd32fb1 100644
--- a/src/core/cpu/kernels/add/neon/list.h
+++ b/src/core/cpu/kernels/add/neon/list.h
@@ -38,9 +38,6 @@ namespace cpu
 DECLARE_ADD_KERNEL(add_qasymm8_neon);
 DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
 DECLARE_ADD_KERNEL(add_qsymm16_neon);
-DECLARE_ADD_KERNEL(add_s16_u8_s16_neon);
-DECLARE_ADD_KERNEL(add_u8_s16_s16_neon);
-DECLARE_ADD_KERNEL(add_u8_u8_s16_neon);
 
 #undef DECLARE_ADD_KERNEL
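The file deleted above implemented the mixed-precision adds by widening U8 lanes to S16 before the arithmetic. A standalone illustration of that idiom in raw NEON intrinsics (AArch64), independent of the library's wrapper layer; the function name is hypothetical:

#include <arm_neon.h>

// Adds 8 unsigned bytes from a and b into 8 saturated signed 16-bit results.
static void add_u8_u8_s16_block(const uint8_t *a, const uint8_t *b, int16_t *out)
{
    const uint8x8_t va    = vld1_u8(a);                          // load 8 x u8
    const uint8x8_t vb    = vld1_u8(b);
    const int16x8_t va_16 = vreinterpretq_s16_u16(vmovl_u8(va)); // widen u8 -> u16, reinterpret as s16
    const int16x8_t vb_16 = vreinterpretq_s16_u16(vmovl_u8(vb));
    vst1q_s16(out, vqaddq_s16(va_16, vb_16));                    // saturating add, store 8 x s16
}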
diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp
deleted file mode 100644
index bd8179205b..0000000000
--- a/src/core/cpu/kernels/add/sve/integer.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_u8_u8_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-    const auto all_true_pg    = svptrue_b8();
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            int      x    = window_start_x;
-            svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-            svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-            svbool_t pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
-            do
-            {
-                const auto vsrc0 = svld1(pg_u, input1_ptr + x);
-                const auto vsrc1 = svld1(pg_u, input2_ptr + x);
-
-                const auto vsrc0_lo = svreinterpret_s16_u16(svunpklo(vsrc0));
-                const auto vsrc0_hi = svreinterpret_s16_u16(svunpkhi(vsrc0));
-                const auto vsrc1_lo = svreinterpret_s16_u16(svunpklo(vsrc1));
-                const auto vsrc1_hi = svreinterpret_s16_u16(svunpkhi(vsrc1));
-                svst1(pg_0, output_ptr + x, svqadd(vsrc0_lo, vsrc1_lo));
-                svst1(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_hi, vsrc1_hi));
-
-                x += svcntb();
-                pg_u = svwhilelt_b8(x, window_end_x);
-                pg_0 = svwhilelt_b16(x, window_end_x);
-                pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
-            }
-            while(svptest_any(all_true_pg, pg_u));
-        }
-        else
-        {
-            int      x    = window_start_x;
-            svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-            svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-            svbool_t pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
-            do
-            {
-                const auto vsrc0 = svld1(pg_u, input1_ptr + x);
-                const auto vsrc1 = svld1(pg_u, input2_ptr + x);
-
-                const auto vsrc0_lo = svreinterpret_s16_u16(svunpklo(vsrc0));
-                const auto vsrc0_hi = svreinterpret_s16_u16(svunpkhi(vsrc0));
-                const auto vsrc1_lo = svreinterpret_s16_u16(svunpklo(vsrc1));
-                const auto vsrc1_hi = svreinterpret_s16_u16(svunpkhi(vsrc1));
-                svst1(pg_0, output_ptr + x, svqadd(vsrc0_lo, vsrc1_lo));
-                svst1(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_hi, vsrc1_hi));
-
-                x += svcntb();
-                pg_u = svwhilelt_b8(x, window_end_x);
-                pg_0 = svwhilelt_b16(x, window_end_x);
-                pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
-            }
-            while(svptest_any(all_true_pg, pg_u));
-        }
-    },
-    input1, input2, output);
-}
-
-void add_s16_u8_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-    const auto all_true_pg    = svptrue_b8();
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            int      x    = window_start_x;
-            svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-            svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-            svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-            do
-            {
-                const auto vsrc0_0  = svld1_s16(pg_0, input1_ptr + x);
-                const auto vsrc0_1  = svld1_s16(pg_1, input1_ptr + x + svcnth());
-                const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
-                const auto vsrc1_0  = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
-                const auto vsrc1_1  = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
-                svst1_s16(pg_0, output_ptr + x, svadd_s16_z(pg_0, vsrc0_0, vsrc1_0));
-                svst1_s16(pg_1, output_ptr + x + svcnth(), svadd_s16_z(pg_1, vsrc0_1, vsrc1_1));
-
-                x += svcntb();
-                pg_u = svwhilelt_b8(x, window_end_x);
-                pg_0 = svwhilelt_b16(x, window_end_x);
-                pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg_u));
-        }
-        else
-        {
-            int      x    = window_start_x;
-            svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-            svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-            svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-            do
-            {
-                const auto vsrc0_0  = svld1_s16(pg_0, input1_ptr + x);
-                const auto vsrc0_1  = svld1_s16(pg_1, input1_ptr + x + svcnth());
-                const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
-                const auto vsrc1_0  = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
-                const auto vsrc1_1  = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
-
-                svst1_s16(pg_0, output_ptr + x, svqadd(vsrc0_0, vsrc1_0));
-                svst1_s16(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_1, vsrc1_1));
-
-                x += svcntb();
-                pg_u = svwhilelt_b8(x, window_end_x);
-                pg_0 = svwhilelt_b16(x, window_end_x);
-                pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg_u));
-        }
-    },
-    input1, input2, output);
-}
-
-void add_u8_s16_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Simply swap the two input buffers:
-    add_s16_u8_s16_sve(src1, src0, dst, policy, window);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h
index 9e439497c9..4d29c2a8f1 100644
--- a/src/core/cpu/kernels/add/sve/list.h
+++ b/src/core/cpu/kernels/add/sve/list.h
@@ -42,9 +42,6 @@ namespace cpu
 DECLARE_ADD_KERNEL(add_qasymm8_sve);
 DECLARE_ADD_KERNEL(add_qasymm8_signed_sve);
 DECLARE_ADD_KERNEL(add_qsymm16_sve);
-DECLARE_ADD_KERNEL(add_s16_u8_s16_sve);
-DECLARE_ADD_KERNEL(add_u8_s16_s16_sve);
-DECLARE_ADD_KERNEL(add_u8_u8_s16_sve);
 
 #undef DECLARE_ADD_KERNEL
diff --git a/src/core/cpu/kernels/sub/neon/integer.cpp b/src/core/cpu/kernels/sub/neon/integer.cpp
deleted file mode 100644
index bba73df1e8..0000000000
--- a/src/core/cpu/kernels/sub/neon/integer.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void sub_s16_u8_s16_impl(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_swapped)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = wrapper::vloadq(input1_ptr + x);
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                const auto res  = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
-                wrapper::vstore(output_ptr + x, res);
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto res = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
-                *(output_ptr + x) = res;
-            }
-        }
-        else
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = wrapper::vloadq(input1_ptr + x);
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                const auto res  = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
-                wrapper::vstore(output_ptr + x, res);
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto res = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
-                *(output_ptr + x) = res;
-            }
-        }
-    },
-    input1, input2, output);
-}
-}
-
-void sub_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    sub_s16_u8_s16_impl(src1, src0, dst, policy, window, false);
-}
-
-void sub_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Swap arguments
-    sub_s16_u8_s16_impl(src1, src0, dst, policy, window, true);
-}
-
-void sub_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
-            }
-        }
-        else
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
-                                                     static_cast<int16_t>(*(input2_ptr + x)));
-            }
-        }
-    },
-    input1, input2, output);
-}
-
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/sub/neon/list.h b/src/core/cpu/kernels/sub/neon/list.h
index 1ab4e6367b..ac1346001a 100644
--- a/src/core/cpu/kernels/sub/neon/list.h
+++ b/src/core/cpu/kernels/sub/neon/list.h
@@ -38,9 +38,6 @@ namespace cpu
 DECLARE_SUB_KERNEL(sub_qasymm8_neon);
 DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon);
 DECLARE_SUB_KERNEL(sub_qsymm16_neon);
-DECLARE_SUB_KERNEL(sub_s16_u8_s16_neon);
-DECLARE_SUB_KERNEL(sub_u8_s16_s16_neon);
-DECLARE_SUB_KERNEL(sub_u8_u8_s16_neon);
 
 #undef DECLARE_SUB_KERNEL
-- 
cgit v1.2.1
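With the widening configurations removed, a caller that relied on (U8,U8) -> S16 can cast explicitly and add in S16. A hedged migration sketch using the public runtime functions of this release (NECast, NEArithmeticAddition); the signatures should be checked against the installed headers, and dst_s16 is assumed to be initialised and allocated by the caller.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void add_u8_inputs_into_s16(Tensor &a_u8, Tensor &b_u8, Tensor &dst_s16)
{
    // Intermediate S16 views of the U8 inputs
    Tensor a_s16, b_s16;
    a_s16.allocator()->init(TensorInfo(a_u8.info()->tensor_shape(), 1, DataType::S16));
    b_s16.allocator()->init(TensorInfo(b_u8.info()->tensor_shape(), 1, DataType::S16));

    NECast               cast_a, cast_b;
    NEArithmeticAddition add;
    cast_a.configure(&a_u8, &a_s16, ConvertPolicy::WRAP); // U8 -> S16 is lossless
    cast_b.configure(&b_u8, &b_s16, ConvertPolicy::WRAP);
    add.configure(&a_s16, &b_s16, &dst_s16, ConvertPolicy::SATURATE);

    a_s16.allocator()->allocate();
    b_s16.allocator()->allocate();

    cast_a.run();
    cast_b.run();
    add.run();
}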