From 9b921be1ff7283050eb39d9ce1b10b5c8bfc1300 Mon Sep 17 00:00:00 2001
From: Gunes Bayir
Date: Thu, 28 Jul 2022 17:44:00 +0100
Subject: Optimize add layer by considering the input tensors as 1D array

Resolves: COMPMID-5108

Change-Id: I544f8160fbe5b4ffbef348d1fbd3dd626a6e1bdb
Signed-off-by: Gunes Bayir
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8002
Reviewed-by: Gian Marco Iodice
Comments-Addressed: Arm Jenkins
Benchmark: Arm Jenkins
Tested-by: Arm Jenkins
---
 src/cpu/kernels/add/generic/neon/fp16.cpp    |  7 ++++-
 src/cpu/kernels/add/generic/neon/fp32.cpp    |  7 ++++-
 src/cpu/kernels/add/generic/neon/impl.cpp    | 40 +++++++++++++++++++++++++++-
 src/cpu/kernels/add/generic/neon/impl.h      |  5 +++-
 src/cpu/kernels/add/generic/neon/integer.cpp | 17 +++++++++++-
 5 files changed, 71 insertions(+), 5 deletions(-)

(limited to 'src/cpu/kernels/add/generic')

diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp
index 12d4a467b7..bb6636af1e 100644
--- a/src/cpu/kernels/add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,11 @@ void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon<float16_t>(src0, src1, dst, policy, window);
 }
+
+void add_fp16_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<float16_t>(src0, src1, dst, policy, window);
+}
 }
 } // namespace arm_compute
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp
index 3563162fce..1d313a191d 100644
--- a/src/cpu/kernels/add/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,5 +32,10 @@ void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon<float>(src0, src1, dst, policy, window);
 }
+
+void add_fp32_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<float>(src0, src1, dst, policy, window);
+}
 }
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp
index ad3e445ab0..67985c985e 100644
--- a/src/cpu/kernels/add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -128,6 +128,35 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
     }
 }
 
+template <typename ScalarType>
+void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    const ScalarType *src0_ptr = reinterpret_cast<const ScalarType *>(src0->buffer());
+    const ScalarType *src1_ptr = reinterpret_cast<const ScalarType *>(src1->buffer());
+    ScalarType *dst_ptr = reinterpret_cast<ScalarType *>(dst->buffer());
+
+    constexpr int window_step_x = 16 / sizeof(ScalarType);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x = static_cast<int>(window.x().end());
+
+    int x = window_start_x;
+    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    {
+        const auto val1 = wrapper::vloadq(src0_ptr + x);
+        const auto val2 = wrapper::vloadq(src1_ptr + x);
+        const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+        wrapper::vstore(dst_ptr + x, res);
+    }
+
+    // Compute left-over elements
+    for(; x < window_end_x; ++x)
+    {
+        const auto val1 = *(src0_ptr + x);
+        const auto val2 = *(src1_ptr + x);
+        *(dst_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
+    }
+}
+
 template void add_same_neon<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 template void add_same_neon<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 template void add_same_neon<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
@@ -137,5 +166,14 @@ template void add_same_neon<int16_t>(const ITensor *src0, const ITensor *src1, I
 template void add_same_neon<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
 
+template void add_same_neon_as_1d_array<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void add_same_neon_as_1d_array<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h
index 07afdda225..f8f0f517b0 100644
--- a/src/cpu/kernels/add/generic/neon/impl.h
+++ b/src/cpu/kernels/add/generic/neon/impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,9 @@ namespace cpu
 {
 template <typename ScalarType>
 void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+template <typename ScalarType>
+void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
 #endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H
\ No newline at end of file
diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp
index 62c19e66b1..ffead03474 100644
--- a/src/cpu/kernels/add/generic/neon/integer.cpp
+++ b/src/cpu/kernels/add/generic/neon/integer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,5 +42,20 @@ void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
 {
     return add_same_neon<int32_t>(src0, src1, dst, policy, window);
 }
+
+void add_u8_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<uint8_t>(src0, src1, dst, policy, window);
+}
+
+void add_s16_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<int16_t>(src0, src1, dst, policy, window);
+}
+
+void add_s32_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    return add_same_neon_as_1d_array<int32_t>(src0, src1, dst, policy, window);
+}
 }
 } // namespace arm_compute
--
cgit v1.2.1