From 9b921be1ff7283050eb39d9ce1b10b5c8bfc1300 Mon Sep 17 00:00:00 2001
From: Gunes Bayir
Date: Thu, 28 Jul 2022 17:44:00 +0100
Subject: Optimize add layer by considering the input tensors as 1D array

Resolves: COMPMID-5108
Change-Id: I544f8160fbe5b4ffbef348d1fbd3dd626a6e1bdb
Signed-off-by: Gunes Bayir
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8002
Reviewed-by: Gian Marco Iodice
Comments-Addressed: Arm Jenkins
Benchmark: Arm Jenkins
Tested-by: Arm Jenkins
---
 src/cpu/kernels/CpuAddKernel.cpp             | 123 ++++++++++++++++++++-------
 src/cpu/kernels/CpuAddKernel.h               |  12 ++-
 src/cpu/kernels/CpuKernelSelectionTypes.h    |   8 ++
 src/cpu/kernels/add/generic/neon/fp16.cpp    |   7 +-
 src/cpu/kernels/add/generic/neon/fp32.cpp    |   7 +-
 src/cpu/kernels/add/generic/neon/impl.cpp    |  40 ++++++++-
 src/cpu/kernels/add/generic/neon/impl.h      |   5 +-
 src/cpu/kernels/add/generic/neon/integer.cpp |  17 +++-
 src/cpu/kernels/add/list.h                   |   7 +-
 src/cpu/operators/CpuAdd.cpp                 |  16 +++-
 src/cpu/operators/CpuAdd.h                   |   5 +-
 tests/validation/NEON/ArithmeticAddition.cpp |  31 +++++--
 12 files changed, 226 insertions(+), 52 deletions(-)

diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index e756effea9..85ae410a94 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -39,82 +39,127 @@ namespace cpu
 {
 namespace kernels
 {
+bool can_interpret_inputs_as_1d_array(const ITensorInfo &src0, const ITensorInfo &src1)
+{
+    return !src0.has_padding() && !src1.has_padding() && src0.tensor_shape() == src1.tensor_shape();
+}
+
 namespace
 {
 static const std::vector<CpuAddKernel::AddKernel> available_kernels =
 {
+    {
+        "neon_fp32_add_as_1d_array",
+        [](const CpuAddKernelDataTypeISASelectorData & data)
+        {
+            return (data.dt == DataType::F32) && data.can_interpret_inputs_as_1d_array == true;
+        },
+        REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon_as_1d_array)
+    },
+    {
+        "neon_fp16_add_as_1d_array",
+        [](const CpuAddKernelDataTypeISASelectorData & data)
+        {
+            return (data.dt == DataType::F16) && data.can_interpret_inputs_as_1d_array == true;
+        },
+        REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon_as_1d_array)
+    },
+    {
+        "neon_u8_add_as_1d_array",
+        [](const CpuAddKernelDataTypeISASelectorData & data)
+        {
+            return (data.dt == DataType::U8) && data.can_interpret_inputs_as_1d_array == true;
+        },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon_as_1d_array)
+    },
+    {
+        "neon_s16_add_as_1d_array",
+        [](const CpuAddKernelDataTypeISASelectorData & data)
+        {
+            return (data.dt == DataType::S16) && data.can_interpret_inputs_as_1d_array == true;
+        },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon_as_1d_array)
+    },
+    {
+        "neon_s32_add_as_1d_array",
+        [](const CpuAddKernelDataTypeISASelectorData & data)
+        {
+            return (data.dt == DataType::S32) && data.can_interpret_inputs_as_1d_array == true;
+        },
+        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon_as_1d_array)
+    },
     {
         "sve2_qu8_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::QASYMM8) && data.isa.sve2;
+            return (data.dt == DataType::QASYMM8) && data.isa.sve2 && data.can_interpret_inputs_as_1d_array == false;
         },
         REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)
     },
     {
         "sve2_qs8_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
+            return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2 && data.can_interpret_inputs_as_1d_array == false;
         },
         REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)
     },
     {
         "sve2_qs16_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::QSYMM16) && data.isa.sve2;
+            return (data.dt == DataType::QSYMM16) && data.isa.sve2 && data.can_interpret_inputs_as_1d_array == false;
         },
         REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)
     },
     {
         "sve_fp32_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::F32) && data.isa.sve;
+            return (data.dt == DataType::F32) && data.isa.sve && data.can_interpret_inputs_as_1d_array == false;
         },
         REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)
     },
     {
         "sve_fp16_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16;
+            return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16 && data.can_interpret_inputs_as_1d_array == false;
         },
         REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)
     },
     {
         "sve_u8_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::U8) && data.isa.sve;
+            return (data.dt == DataType::U8) && data.isa.sve && data.can_interpret_inputs_as_1d_array == false;
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)
     },
     {
         "sve_s16_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::S16) && data.isa.sve;
+            return (data.dt == DataType::S16) && data.isa.sve && data.can_interpret_inputs_as_1d_array == false;
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)
     },
     {
         "sve_s32_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::S32) && data.isa.sve;
+            return (data.dt == DataType::S32) && data.isa.sve && data.can_interpret_inputs_as_1d_array == false;
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)
     },
     {
         "neon_fp32_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
+        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
         REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)
     },
     {
         "neon_fp16_add",
-        [](const DataTypeISASelectorData & data)
+        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
             return (data.dt == DataType::F16) && data.isa.fp16;
         },
@@ -122,32 +167,32 @@ static const std::vector<CpuAddKernel::AddKernel> available_kernels =
     },
     {
         "neon_u8_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
+        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)
     },
     {
         "neon_s16_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
+        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)
     },
     {
         "neon_s32_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
+        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)
     },
     {
         "neon_qu8_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
+        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
         REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
     },
     {
         "neon_qs8_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
     },
     {
         "neon_qs16_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
+        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
     }
 };
@@ -177,7 +222,8 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons
                                         "Wrong shape for dst");
     }
 
-    const auto *uk = CpuAddKernel::get_implementation(DataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa() });
+    const auto uk = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ src0.data_type(),
+                                                                                          CPUInfo::get().get_isa(), can_interpret_inputs_as_1d_array(src0, src1) });
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     return Status{};
@@ -185,16 +231,25 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons
 
 std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &src0, const ITensorInfo &src1, ITensorInfo &dst)
 {
-    const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
+    if(can_interpret_inputs_as_1d_array(src0, src1))
+    {
+        Window window;
+        window.set(0, Window::Dimension(0, src0.tensor_shape().total_size()));
+        return std::make_pair(Status{}, window);
+    }
+    else
+    {
+        const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
 
-    // Auto initialize dst if not initialized
-    set_shape_if_empty(dst, out_shape);
-    set_data_type_if_unknown(dst, src0.data_type());
+        // Auto initialize dst if not initialized
+        set_shape_if_empty(dst, out_shape);
+        set_data_type_if_unknown(dst, src0.data_type());
 
-    Window win = calculate_max_window(out_shape, Steps());
+        Window win = calculate_max_window(out_shape, Steps());
 
-    // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped
-    return std::make_pair(Status{}, win);
+        // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped
+        return std::make_pair(Status{}, win);
+    }
 }
 } // namespace
@@ -203,7 +258,9 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
 
-    const auto uk = CpuAddKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+    _can_interpret_inputs_as_1d_array = can_interpret_inputs_as_1d_array(*src0, *src1);
+    const auto uk = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ src0->data_type(),
+                                                                                          CPUInfo::get().get_isa(), _can_interpret_inputs_as_1d_array });
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index 6638135580..1afbc1a4d0 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -42,9 +42,9 @@ private:
 public:
     struct AddKernel
     {
-        const char                  *name;
-        const DataTypeISASelectorPtr is_selected;
-        AddKernelPtr                 ukernel;
+        const char                                  *name;
+        const CpuAddKernelDataTypeISASelectorDataPtr is_selected;
+        AddKernelPtr                                 ukernel;
     };
 
     CpuAddKernel() = default;
@@ -91,10 +91,16 @@ public:
     static const std::vector<AddKernel> &get_available_kernels();
 
+    bool get_can_interpret_inputs_as_1d_array()
+    {
+        return _can_interpret_inputs_as_1d_array;
+    }
+
 private:
     ConvertPolicy _policy{};
     AddKernelPtr  _run_method{ nullptr };
     std::string   _name{};
+    bool          _can_interpret_inputs_as_1d_array{ false };
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index 12542e5064..19c41f9fcd 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -83,6 +83,13 @@ struct ActivationDataTypeISASelectorData
     ActivationLayerInfo::ActivationFunction f;
 };
 
+struct CpuAddKernelDataTypeISASelectorData
+{
+    DataType            dt;
+    cpuinfo::CpuIsaInfo isa;
+    bool                can_interpret_inputs_as_1d_array;
+};
+
 // Selector pointer types
 using DataTypeISASelectorPtr        = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
 using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
@@ -91,6 +98,7 @@
 using ElementwiseDataTypeISASelectorPtr      = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
 using CastDataTypeISASelectorDataPtr         = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type;
 using ActivationDataTypeISASelectorDataPtr   = std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type;
+using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp
index 12d4a467b7..bb6636af1e 100644
--- a/src/cpu/kernels/add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,11 @@ void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon<float16_t>(src0, src1, dst, policy, window);
 }
+
+void add_fp16_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<float16_t>(src0, src1, dst, policy, window);
+}
 }
 } // namespace arm_compute
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp
index 3563162fce..1d313a191d 100644
--- a/src/cpu/kernels/add/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,5 +32,10 @@ void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon<float>(src0, src1, dst, policy, window);
 }
+
+void add_fp32_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<float>(src0, src1, dst, policy, window);
+}
 }
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp
index ad3e445ab0..67985c985e 100644
--- a/src/cpu/kernels/add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -128,6 +128,35 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
     }
 }
 
+template <typename ScalarType>
+void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    const ScalarType *src0_ptr = reinterpret_cast<const ScalarType *>(src0->buffer());
+    const ScalarType *src1_ptr = reinterpret_cast<const ScalarType *>(src1->buffer());
+    ScalarType       *dst_ptr  = reinterpret_cast<ScalarType *>(dst->buffer());
+
+    constexpr int window_step_x  = 16 / sizeof(ScalarType);
+    const auto    window_start_x = static_cast<int>(window.x().start());
+    const auto    window_end_x   = static_cast<int>(window.x().end());
+
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const auto val1 = wrapper::vloadq(src0_ptr + x);
+        const auto val2 = wrapper::vloadq(src1_ptr + x);
+        const auto res  = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+        wrapper::vstore(dst_ptr + x, res);
+    }
+
+    // Compute left-over elements
+    for(; x < window_end_x; ++x)
+    {
+        const auto val1 = *(src0_ptr + x);
+        const auto val2 = *(src1_ptr + x);
+        *(dst_ptr + x)  = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
+    }
+}
+
 template void add_same_neon<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 template void add_same_neon<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 template void add_same_neon<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
@@ -137,5 +166,14 @@ template void add_same_neon(const ITensor *src0, const ITensor *src1, I
 template void add_same_neon<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
 
+template void add_same_neon_as_1d_array<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void add_same_neon_as_1d_array<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h
index 07afdda225..f8f0f517b0 100644
--- a/src/cpu/kernels/add/generic/neon/impl.h
+++ b/src/cpu/kernels/add/generic/neon/impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,9 @@ namespace cpu
 {
 template <typename ScalarType>
 void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+template <typename ScalarType>
+void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
 #endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H
\ No newline at end of file
diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp
index 62c19e66b1..ffead03474 100644
--- a/src/cpu/kernels/add/generic/neon/integer.cpp
+++ b/src/cpu/kernels/add/generic/neon/integer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,5 +42,20 @@ void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon<int32_t>(src0, src1, dst, policy, window);
 }
+
+void add_u8_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<uint8_t>(src0, src1, dst, policy, window);
+}
+
+void add_s16_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<int16_t>(src0, src1, dst, policy, window);
+}
+
+void add_s32_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<int32_t>(src0, src1, dst, policy, window);
+}
 }
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h
index 9d7c9a67ff..0285b231e0 100644
--- a/src/cpu/kernels/add/list.h
+++ b/src/cpu/kernels/add/list.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,10 +38,15 @@ DECLARE_ADD_KERNEL(add_qasymm8_neon);
 DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
 DECLARE_ADD_KERNEL(add_qsymm16_neon);
 DECLARE_ADD_KERNEL(add_fp32_neon);
+DECLARE_ADD_KERNEL(add_fp32_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_fp16_neon);
+DECLARE_ADD_KERNEL(add_fp16_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_u8_neon);
+DECLARE_ADD_KERNEL(add_u8_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_s16_neon);
+DECLARE_ADD_KERNEL(add_s16_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_s32_neon);
+DECLARE_ADD_KERNEL(add_s32_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_fp32_sve);
 DECLARE_ADD_KERNEL(add_fp16_sve);
 DECLARE_ADD_KERNEL(add_u8_sve);
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
index 76ec7d7d8d..828361e7cf 100644
--- a/src/cpu/operators/CpuAdd.cpp
+++ b/src/cpu/operators/CpuAdd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 
 #include "src/common/utils/Log.h"
 
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 namespace arm_compute
 {
 namespace cpu
@@ -45,5 +47,17 @@ Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
 }
+
+void CpuAdd::run(ITensorPack &tensors)
+{
+    if(static_cast<kernels::CpuAddKernel *>(_kernel.get())->get_can_interpret_inputs_as_1d_array())
+    {
+        NEScheduler::get().schedule_op(_kernel.get(), Window::DimX, _kernel->window(), tensors);
+    }
+    else
+    {
+        ICpuOperator::run(tensors);
+    }
+}
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h
index d8ec620aeb..4ad6d7fe65 100644
--- a/src/cpu/operators/CpuAdd.h
+++ b/src/cpu/operators/CpuAdd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,9 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
index c72e082a74..f94e329c9c 100644
--- a/tests/validation/NEON/ArithmeticAddition.cpp
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -89,7 +89,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 }
 
 DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
-                                                                 combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                                                 combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
                                                                          framework::dataset::make("DataType", { DataType::F32,
                                                                                                                 DataType::F16,
                                                                                                                 DataType::U8,
@@ -99,19 +99,22 @@ DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
                                                                                                                 DataType::QASYMM8_SIGNED,
                                                                                                                 DataType::QSYMM16
                                                                                                               })),
-                                                                 combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                                                                         framework::dataset::make("CanInterpretAs1D", {true, false})),
+                                                                 combine(combine(framework::dataset::make("CpuExt", std::string("SVE")),
                                                                          framework::dataset::make("DataType", { DataType::F32,
                                                                                                                 DataType::F16,
                                                                                                                 DataType::U8,
                                                                                                                 DataType::S16,
                                                                                                                 DataType::S32
-                                                                                                              }))),
-                                                                 combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                                                                                                              })),
+                                                                         framework::dataset::make("CanInterpretAs1D", {true, false}))),
+                                                                 combine(combine(framework::dataset::make("CpuExt", std::string("SVE2")),
                                                                          framework::dataset::make("DataType", { DataType::QASYMM8,
                                                                                                                 DataType::QASYMM8_SIGNED,
                                                                                                                 DataType::QSYMM16
-                                                                                                              }))),
-               cpu_ext, data_type)
+                                                                                                              })),
+                                                                         framework::dataset::make("CanInterpretAs1D", {false}))),
+               cpu_ext, data_type, can_interpret_inputs_as_1d_array)
 {
     using namespace cpu::kernels;
 
@@ -121,11 +124,23 @@ DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
     cpu_isa.sve2 = (cpu_ext == "SVE2");
     cpu_isa.fp16 = (data_type == DataType::F16);
 
-    const auto *selected_impl = CpuAddKernel::get_implementation(DataTypeISASelectorData{data_type, cpu_isa}, cpu::KernelSelectionType::Preferred);
+    const auto *selected_impl = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{data_type, cpu_isa, can_interpret_inputs_as_1d_array}, cpu::KernelSelectionType::Preferred);
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
 
-    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_add";
+    bool float_or_integer = (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 ||
+                             data_type == DataType::S16 || data_type == DataType::S32);
+
+    std::string expected;
+    if(can_interpret_inputs_as_1d_array && float_or_integer)
+    {
+        expected = "neon_" + cpu_impl_dt(data_type) + "_add_as_1d_array";
+    }
+    else
+    {
+        expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_add";
+    }
+
     std::string actual = selected_impl->name;
 
     ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
--
cgit v1.2.1
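
For reference, the fast path introduced by this patch is taken only when both inputs share the same shape and carry no padding (see can_interpret_inputs_as_1d_array()). The sketch below is illustrative only and not part of the patch; it assumes the public NEArithmeticAddition runtime wrapper with default, padding-free allocation and an arbitrary example shape and data type, under which the new neon_*_add_as_1d_array kernels would be expected to be selected.

// Illustrative usage sketch (not part of the patch): two same-shape, unpadded
// F32 tensors, so CpuAddKernel should resolve to the *_add_as_1d_array path.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor            a, b, out;
    const TensorShape shape(128, 24, 8); // identical shapes, so no broadcasting is needed

    a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    out.allocator()->init(TensorInfo(shape, 1, DataType::F32));

    NEArithmeticAddition add;
    add.configure(&a, &b, &out, ConvertPolicy::SATURATE);

    a.allocator()->allocate();   // default allocation adds no padding,
    b.allocator()->allocate();   // so the inputs can be treated as flat 1D arrays
    out.allocator()->allocate();

    // ... fill a and b with data ...
    add.run();
    return 0;
}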