From 9fc0b5c484e0f6cfe52009719ebccc179ada1112 Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do
Date: Thu, 20 Oct 2022 11:18:17 +0100
Subject: Update reinterpret tensor as 1D for CPU add

* Use the same implementation as other layers.

Resolves: COMPMID-5108
Signed-off-by: Viet-Hoa Do
Change-Id: I5a50259b398b71ca1f61b5ee8daa539bf8263fac
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8501
Benchmark: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Gunes Bayir
Comments-Addressed: Arm Jenkins
---
 src/cpu/kernels/CpuAddKernel.cpp             | 102 +++++----------------------
 src/cpu/kernels/CpuAddKernel.h               |   6 +-
 src/cpu/kernels/CpuKernelSelectionTypes.h    |   1 -
 src/cpu/kernels/add/generic/neon/fp16.cpp    |   5 --
 src/cpu/kernels/add/generic/neon/fp32.cpp    |   5 --
 src/cpu/kernels/add/generic/neon/impl.cpp    |  38 ----------
 src/cpu/kernels/add/generic/neon/impl.h      |   3 -
 src/cpu/kernels/add/generic/neon/integer.cpp |  15 ----
 src/cpu/kernels/add/list.h                   |   5 --
 src/cpu/operators/CpuAdd.cpp                 |  11 +--
 10 files changed, 25 insertions(+), 166 deletions(-)

(limited to 'src/cpu')

diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 47ff6abf17..1648a46cdc 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -39,11 +39,6 @@ namespace cpu
 {
 namespace kernels
 {
-bool can_interpret_inputs_as_1d_array(const ITensorInfo &src0, const ITensorInfo &src1)
-{
-    return !src0.has_padding() && !src1.has_padding() && src0.tensor_shape() == src1.tensor_shape() && src0.strides_in_bytes() == src1.strides_in_bytes();
-}
-
 namespace
 {
 static const std::vector available_kernels =
@@ -64,51 +59,11 @@ static const std::vector available_kernels =
         },
         REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint)
     },
-    {
-        "neon_fp32_add_as_1d_array",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F32) && data.can_interpret_inputs_as_1d_array == true;
-        },
-        REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon_as_1d_array)
-    },
-    {
-        "neon_fp16_add_as_1d_array",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F16) && data.can_interpret_inputs_as_1d_array == true;
-        },
-        REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon_as_1d_array)
-    },
-    {
-        "neon_u8_add_as_1d_array",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::U8) && data.can_interpret_inputs_as_1d_array == true;
-        },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon_as_1d_array)
-    },
-    {
-        "neon_s16_add_as_1d_array",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::S16) && data.can_interpret_inputs_as_1d_array == true;
-        },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon_as_1d_array)
-    },
-    {
-        "neon_s32_add_as_1d_array",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::S32) && data.can_interpret_inputs_as_1d_array == true;
-        },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon_as_1d_array)
-    },
     {
         "sve2_qu8_add",
         [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::QASYMM8) && data.isa.sve2 && data.can_interpret_inputs_as_1d_array == false;
+            return (data.dt == DataType::QASYMM8) && data.isa.sve2;
         },
         REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)
     },
@@ -116,7 +71,7 @@ static const std::vector available_kernels =
         "sve2_qs8_add",
         [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2 && data.can_interpret_inputs_as_1d_array == false;
+            return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
         },
         REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)
     },
@@ -124,7 +79,7 @@ static const std::vector available_kernels =
         "sve2_qs16_add",
         [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::QSYMM16) && data.isa.sve2 && data.can_interpret_inputs_as_1d_array == false;
+            return (data.dt == DataType::QSYMM16) && data.isa.sve2;
         },
         REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)
     },
@@ -132,7 +87,7 @@ static const std::vector available_kernels =
         "sve_fp32_add",
         [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::F32) && data.isa.sve && data.can_interpret_inputs_as_1d_array == false;
+            return (data.dt == DataType::F32) && data.isa.sve;
         },
         REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)
     },
@@ -140,7 +95,7 @@ static const std::vector available_kernels =
         "sve_fp16_add",
         [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16 && data.can_interpret_inputs_as_1d_array == false;
+            return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16;
         },
         REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)
     },
@@ -148,7 +103,7 @@ static const std::vector available_kernels =
         "sve_u8_add",
         [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::U8) && data.isa.sve && data.can_interpret_inputs_as_1d_array == false;
+            return (data.dt == DataType::U8) && data.isa.sve;
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)
     },
@@ -156,7 +111,7 @@ static const std::vector available_kernels =
         "sve_s16_add",
        [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::S16) && data.isa.sve && data.can_interpret_inputs_as_1d_array == false;
+            return (data.dt == DataType::S16) && data.isa.sve;
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)
     },
@@ -164,7 +119,7 @@ static const std::vector available_kernels =
         "sve_s32_add",
         [](const CpuAddKernelDataTypeISASelectorData & data)
         {
-            return (data.dt == DataType::S32) && data.isa.sve && data.can_interpret_inputs_as_1d_array == false;
+            return (data.dt == DataType::S32) && data.isa.sve;
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)
     },
@@ -240,34 +195,11 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons
     const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
     const auto uk = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ src0.data_type(),
-                                                     CPUInfo::get().get_isa(), can_interpret_inputs_as_1d_array(src0, src1), can_use_fixedpoint });
+                                                     CPUInfo::get().get_isa(), can_use_fixedpoint });
 
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     return Status{};
 }
-
-std::pair validate_and_configure_window(const ITensorInfo &src0, const ITensorInfo &src1, ITensorInfo &dst)
-{
-    if(can_interpret_inputs_as_1d_array(src0, src1))
-    {
-        Window window;
-        window.set(0, Window::Dimension(0, src0.tensor_shape().total_size()));
-        return std::make_pair(Status{}, window);
-    }
-    else
-    {
-        const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
-
-        // Auto initialize dst if not initialized
-        set_shape_if_empty(dst, out_shape);
-        set_data_type_if_unknown(dst, src0.data_type());
-
-        Window win = calculate_max_window(out_shape, Steps());
-
-        // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped
-        return std::make_pair(Status{}, win);
-    }
-}
 } // namespace
 
 void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
@@ -275,10 +207,9 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
 
-    _can_interpret_inputs_as_1d_array = can_interpret_inputs_as_1d_array(*src0, *src1);
     const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst);
     const auto uk = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ src0->data_type(),
-                                                     CPUInfo::get().get_isa(), _can_interpret_inputs_as_1d_array, can_use_fixedpoint });
+                                                     CPUInfo::get().get_isa(), can_use_fixedpoint });
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
@@ -286,10 +217,16 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
     _run_method = uk->ukernel;
     _name       = std::string("CpuAddKernel").append("/").append(uk->name);
 
+    // Auto initialize dst if not initialized
+    const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape());
+    set_shape_if_empty(*dst, out_shape);
+    set_data_type_if_unknown(*dst, src0->data_type());
+
     // Configure kernel window
-    auto win_config = validate_and_configure_window(*src0, *src1, *dst);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICpuKernel::configure(win_config.second);
+    Window win;
+    std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src0, *src1);
+
+    ICpuKernel::configure(win);
 }
 
 Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
@@ -297,7 +234,6 @@ Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1,
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*src0->clone(), *src1->clone(), *dst->clone()).first);
 
     return Status{};
 }
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index 1afbc1a4d0..e2062c8c33 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -91,16 +91,16 @@ public:
 
     static const std::vector &get_available_kernels();
 
-    bool get_can_interpret_inputs_as_1d_array()
+    size_t get_split_dimension() const
     {
-        return _can_interpret_inputs_as_1d_array;
+        return _split_dimension;
     }
 
 private:
     ConvertPolicy _policy{};
     AddKernelPtr  _run_method{ nullptr };
     std::string   _name{};
-    bool          _can_interpret_inputs_as_1d_array{ false };
+    size_t        _split_dimension{ Window::DimY };
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index 87edb15192..4b2481074a 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -87,7 +87,6 @@ struct CpuAddKernelDataTypeISASelectorData
 {
     DataType            dt;
     cpuinfo::CpuIsaInfo isa;
-    bool                can_interpret_inputs_as_1d_array;
     bool                can_use_fixedpoint;
 };
 
diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp
index bb6636af1e..1e3bc3c986 100644
--- a/src/cpu/kernels/add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp16.cpp
@@ -33,11 +33,6 @@ void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon(src0, src1, dst, policy, window);
 }
-
-void add_fp16_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    return add_same_neon_as_1d_array(src0, src1, dst, policy, window);
-}
 }
 } // namespace arm_compute
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp
index 1d313a191d..1f599b1968 100644
--- a/src/cpu/kernels/add/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp32.cpp
@@ -32,10 +32,5 @@ void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon(src0, src1, dst, policy, window);
 }
-
-void add_fp32_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    return add_same_neon_as_1d_array(src0, src1, dst, policy, window);
-}
 }
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp
index 0f7b31c754..1a0b44fa8c 100644
--- a/src/cpu/kernels/add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -128,35 +128,6 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
     }
 }
 
-template <typename ScalarType>
-void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    const ScalarType *src0_ptr = reinterpret_cast<const ScalarType *>(src0->buffer());
-    const ScalarType *src1_ptr = reinterpret_cast<const ScalarType *>(src1->buffer());
-    ScalarType       *dst_ptr  = reinterpret_cast<ScalarType *>(dst->buffer());
-
-    constexpr int window_step_x  = 16 / sizeof(ScalarType);
-    const auto    window_start_x = static_cast(window.x().start());
-    const auto    window_end_x   = static_cast(window.x().end());
-
-    int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
-    {
-        const auto val1 = wrapper::vloadq(src0_ptr + x);
-        const auto val2 = wrapper::vloadq(src1_ptr + x);
-        const auto res  = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
-        wrapper::vstore(dst_ptr + x, res);
-    }
-
-    // Compute left-over elements
-    for(; x < window_end_x; ++x)
-    {
-        const auto val1 = *(src0_ptr + x);
-        const auto val2 = *(src1_ptr + x);
-        *(dst_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
-    }
-}
-
 bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
     const auto iq0 = src0->quantization_info().uniform();
@@ -383,15 +354,6 @@ template void add_same_neon(const ITensor *src0, const ITensor *src1, I
 template void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
 
-template void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
-
 template void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 template void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h
index e6a12fb4c0..91f347ff9c 100644
--- a/src/cpu/kernels/add/generic/neon/impl.h
+++ b/src/cpu/kernels/add/generic/neon/impl.h
@@ -33,9 +33,6 @@ namespace cpu
 template <typename ScalarType>
 void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 
-template <typename ScalarType>
-void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-
 bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
 template <typename ScalarType>
diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp
index ffead03474..5698d6d552 100644
--- a/src/cpu/kernels/add/generic/neon/integer.cpp
+++ b/src/cpu/kernels/add/generic/neon/integer.cpp
@@ -42,20 +42,5 @@ void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon(src0, src1, dst, policy, window);
 }
-
-void add_u8_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    return add_same_neon_as_1d_array(src0, src1, dst, policy, window);
-}
-
-void add_s16_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    return add_same_neon_as_1d_array(src0, src1, dst, policy, window);
-}
-
-void add_s32_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    return add_same_neon_as_1d_array(src0, src1, dst, policy, window);
-}
 }
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h
index 0285b231e0..7cdb70fd9e 100644
--- a/src/cpu/kernels/add/list.h
+++ b/src/cpu/kernels/add/list.h
@@ -38,15 +38,10 @@ DECLARE_ADD_KERNEL(add_qasymm8_neon);
 DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
 DECLARE_ADD_KERNEL(add_qsymm16_neon);
 DECLARE_ADD_KERNEL(add_fp32_neon);
-DECLARE_ADD_KERNEL(add_fp32_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_fp16_neon);
-DECLARE_ADD_KERNEL(add_fp16_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_u8_neon);
-DECLARE_ADD_KERNEL(add_u8_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_s16_neon);
-DECLARE_ADD_KERNEL(add_s16_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_s32_neon);
-DECLARE_ADD_KERNEL(add_s32_neon_as_1d_array);
 DECLARE_ADD_KERNEL(add_fp32_sve);
 DECLARE_ADD_KERNEL(add_fp16_sve);
 DECLARE_ADD_KERNEL(add_u8_sve);
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
index 828361e7cf..41def8e22f 100644
--- a/src/cpu/operators/CpuAdd.cpp
+++ b/src/cpu/operators/CpuAdd.cpp
@@ -50,14 +50,9 @@ Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const
 
 void CpuAdd::run(ITensorPack &tensors)
 {
-    if(static_cast(_kernel.get())->get_can_interpret_inputs_as_1d_array())
-    {
-        NEScheduler::get().schedule_op(_kernel.get(), Window::DimX, _kernel->window(), tensors);
-    }
-    else
-    {
-        ICpuOperator::run(tensors);
-    }
+    const auto split_dimension = static_cast(_kernel.get())->get_split_dimension();
+
+    NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
 }
 } // namespace cpu
 } // namespace arm_compute
-- 
cgit v1.2.1
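
The standalone C++ sketch below illustrates the idea this patch adopts: instead of registering dedicated "as 1D array" kernels, configure() obtains an execution window plus a split dimension from calculate_squashed_or_max_window(), and CpuAdd::run() hands that dimension to the scheduler. This is not the ComputeLibrary API; SimpleWindow and squash_or_max_window are hypothetical names invented for the illustration, and the library's real squashing check also considers strides and padding, as the removed can_interpret_inputs_as_1d_array() helper did.

#include <array>
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

// Invented for this illustration; not part of ComputeLibrary.
struct SimpleWindow
{
    std::array<std::size_t, 4> extent{}; // iteration range per dimension
};

// Returns the window to execute plus the dimension a scheduler should split
// across threads: dimension 0 when the iteration space can be squashed into
// one flat range, dimension 1 otherwise.
std::pair<SimpleWindow, std::size_t> squash_or_max_window(const std::array<std::size_t, 4> &a,
                                                          const std::array<std::size_t, 4> &b)
{
    if(a == b) // the real check also requires dense, identically laid out, unpadded tensors
    {
        const std::size_t total = std::accumulate(a.begin(), a.end(), std::size_t{ 1 }, std::multiplies<>{});
        return { SimpleWindow{ { total, 1, 1, 1 } }, 0 }; // squashed: one flat range
    }
    return { SimpleWindow{ a }, 1 }; // fallback: keep the N-D window
}

int main()
{
    const std::array<std::size_t, 4> shape{ 2, 3, 4, 5 };
    std::vector<float>               src0(2 * 3 * 4 * 5, 1.f), src1(src0.size(), 2.f), dst(src0.size());

    const auto [win, split_dim] = squash_or_max_window(shape, shape);

    // With a squashed window the kernel body degenerates to one flat loop over
    // every element, which is what the removed add_*_neon_as_1d_array kernels
    // hand-coded for the same-shape, no-padding case.
    for(std::size_t i = 0; i < win.extent[0]; ++i)
    {
        dst[i] = src0[i] + src1[i];
    }

    std::cout << "split dimension: " << split_dim << ", dst[0] = " << dst[0] << "\n";
    return 0;
}

In the updated operator, NEScheduler::get().schedule_op() divides the chosen window among worker threads along the returned split dimension, so the dense same-shape case keeps its single flat loop without needing separate kernel registry entries, and other cases fall back to the kernel's default split dimension (Window::DimY, per the new _split_dimension member).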