-rw-r--r-- | Android.bp | 20
-rw-r--r-- | filelist.json | 20
-rw-r--r-- | src/cpu/kernels/CpuActivationKernel.cpp | 30
-rw-r--r-- | src/cpu/kernels/activation/generic/neon/fp16.cpp | 43
-rw-r--r-- | src/cpu/kernels/activation/generic/neon/fp32.cpp | 39
-rw-r--r-- | src/cpu/kernels/activation/generic/neon/impl.h (renamed from src/cpu/kernels/activation/neon/fp32.cpp) | 86
-rw-r--r-- | src/cpu/kernels/activation/generic/neon/qasymm8.cpp (renamed from src/cpu/kernels/activation/neon/qasymm8.cpp) | 2
-rw-r--r-- | src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp (renamed from src/cpu/kernels/activation/neon/qasymm8_signed.cpp) | 2
-rw-r--r-- | src/cpu/kernels/activation/generic/neon/qsymm16.cpp (renamed from src/cpu/kernels/activation/neon/qsymm16.cpp) | 2
-rw-r--r-- | src/cpu/kernels/activation/generic/sve/fp16.cpp (renamed from src/cpu/kernels/activation/sve/fp16.cpp) | 2
-rw-r--r-- | src/cpu/kernels/activation/generic/sve/fp32.cpp (renamed from src/cpu/kernels/activation/sve/fp32.cpp) | 2
-rw-r--r-- | src/cpu/kernels/activation/generic/sve2/qasymm8.cpp (renamed from src/cpu/kernels/activation/sve/qasymm8.cpp) | 2
-rw-r--r-- | src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp (renamed from src/cpu/kernels/activation/sve/qasymm8_signed.cpp) | 2
-rw-r--r-- | src/cpu/kernels/activation/generic/sve2/qsymm16.cpp (renamed from src/cpu/kernels/activation/sve/qsymm16.cpp) | 2
-rw-r--r-- | src/cpu/kernels/activation/list.h | 20
-rw-r--r-- | src/cpu/kernels/activation/neon/fp16.cpp | 217
16 files changed, 176 insertions, 315 deletions
diff --git a/Android.bp b/Android.bp
index 5727706c07..5654b840fe 100644
--- a/Android.bp
+++ b/Android.bp
@@ -429,16 +429,16 @@ cc_library_static {
         "src/cpu/kernels/CpuTransposeKernel.cpp",
         "src/cpu/kernels/CpuWeightsReshapeKernel.cpp",
         "src/cpu/kernels/CpuWinogradConv2dKernel.cpp",
-        "src/cpu/kernels/activation/neon/fp16.cpp",
-        "src/cpu/kernels/activation/neon/fp32.cpp",
-        "src/cpu/kernels/activation/neon/qasymm8.cpp",
-        "src/cpu/kernels/activation/neon/qasymm8_signed.cpp",
-        "src/cpu/kernels/activation/neon/qsymm16.cpp",
-        "src/cpu/kernels/activation/sve/fp16.cpp",
-        "src/cpu/kernels/activation/sve/fp32.cpp",
-        "src/cpu/kernels/activation/sve/qasymm8.cpp",
-        "src/cpu/kernels/activation/sve/qasymm8_signed.cpp",
-        "src/cpu/kernels/activation/sve/qsymm16.cpp",
+        "src/cpu/kernels/activation/generic/neon/fp16.cpp",
+        "src/cpu/kernels/activation/generic/neon/fp32.cpp",
+        "src/cpu/kernels/activation/generic/neon/qasymm8.cpp",
+        "src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp",
+        "src/cpu/kernels/activation/generic/neon/qsymm16.cpp",
+        "src/cpu/kernels/activation/generic/sve/fp16.cpp",
+        "src/cpu/kernels/activation/generic/sve/fp32.cpp",
+        "src/cpu/kernels/activation/generic/sve2/qasymm8.cpp",
+        "src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp",
+        "src/cpu/kernels/activation/generic/sve2/qsymm16.cpp",
         "src/cpu/kernels/add/generic/neon/fp16.cpp",
         "src/cpu/kernels/add/generic/neon/fp32.cpp",
         "src/cpu/kernels/add/generic/neon/impl.cpp",
diff --git a/filelist.json b/filelist.json
index 428ad7d2cb..d7847480a4 100644
--- a/filelist.json
+++ b/filelist.json
@@ -845,20 +845,20 @@
         "src/cpu/operators/CpuActivation.cpp",
         "src/cpu/kernels/CpuActivationKernel.cpp",
         "src/runtime/NEON/functions/NEActivationLayer.cpp",
-        "src/cpu/kernels/activation/neon/qasymm8.cpp",
-        "src/cpu/kernels/activation/neon/qasymm8_signed.cpp",
-        "src/cpu/kernels/activation/neon/qsymm16.cpp"
+        "src/cpu/kernels/activation/generic/neon/qasymm8.cpp",
+        "src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp",
+        "src/cpu/kernels/activation/generic/neon/qsymm16.cpp"
       ],
       "neon": {
-        "fp16": [ "src/cpu/kernels/activation/neon/fp16.cpp" ],
-        "fp32": [ "src/cpu/kernels/activation/neon/fp32.cpp" ]
+        "fp16": [ "src/cpu/kernels/activation/generic/neon/fp16.cpp" ],
+        "fp32": [ "src/cpu/kernels/activation/generic/neon/fp32.cpp" ]
       },
       "sve": {
-        "fp16": [ "src/cpu/kernels/activation/sve/fp16.cpp" ],
-        "fp32": [ "src/cpu/kernels/activation/sve/fp32.cpp" ],
-        "qasymm8": [ "src/cpu/kernels/activation/sve/qasymm8.cpp" ],
-        "qasymm8_signed": [ "src/cpu/kernels/activation/sve/qasymm8_signed.cpp" ],
-        "qsymm16": [ "src/cpu/kernels/activation/sve/qsymm16.cpp" ]
+        "fp16": [ "src/cpu/kernels/activation/generic/sve/fp16.cpp" ],
+        "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ],
+        "qasymm8": [ "src/cpu/kernels/activation/generic/sve2/qasymm8.cpp" ],
+        "qasymm8_signed": [ "src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp" ],
+        "qsymm16": [ "src/cpu/kernels/activation/generic/sve2/qsymm16.cpp" ]
       }
     }
   },
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index aed73d1fec..3af379d8af 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -65,57 +65,57 @@ static const ActivationKernel available_kernels[] =
     {
         "sve_fp16_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
-        REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation)
+        REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)
     },
     {
         "sve_fp32_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
-        REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
+        REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
 #if defined(ARM_COMPUTE_ENABLE_NEON)
     {
         "neon_fp16_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation)
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)
     },
     {
         "neon_fp32_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
 #if defined(ARM_COMPUTE_ENABLE_SVE2)
     {
         "sve_qu8_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
-        REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation)
+        REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)
     },
     {
         "sve_qs8_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
-        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation)
+        REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)
     },
     {
         "sve_qs16_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16 && data.ci.has_sve2(); },
-        REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation)
+        REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
     {
         "neon_qu8_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation)
+        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)
     },
     {
         "neon_qs8_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation)
+        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)
     },
     {
         "neon_qs16_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
-        REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
+        REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)
     },
 };
@@ -233,18 +233,14 @@ Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *
 size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
 {
     ARM_COMPUTE_UNUSED(thread_count);
-    // Tuning results that gave optimized results in performance investigation
-    if (platform.get_cpu_model() == CPUModel::A73 )
+    // Tuning results that gave optimized results in performance investigation
+    if(platform.get_cpu_model() == CPUModel::A73)
     {
         return 10240;
     }
-    else if (platform.get_cpu_model() == CPUModel::A76)
-    {
-        return 9216;
-    }
     else
     {
-        return ICPPKernel::default_mws;
+        return 9216;
     }
 }
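The hunk above only renames entries in the `available_kernels` table to the new `<arch>_<type>_activation` convention. For readers unfamiliar with the pattern, here is a hedged, self-contained sketch of how such a name/predicate/function-pointer table is typically consumed; `SelectorData`, `get_implementation` and the stub kernels are hypothetical stand-ins for illustration, not the library's actual API:

```cpp
#include <cstdio>

enum class DataType { F16, F32, QASYMM8 };
struct SelectorData { DataType dt; bool has_sve; };

// Mirrors the shape of an ActivationKernel table entry: a name, a selection
// predicate, and the micro-kernel to dispatch to.
struct Kernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

void sve_fp32_stub()  { std::puts("sve_fp32_activation"); }
void neon_fp32_stub() { std::puts("neon_fp32_activation"); }

// Entries are ordered by priority: SVE first, NEON as the fallback.
static const Kernel available_kernels[] =
{
    { "sve_fp32_activation", [](const SelectorData &d) { return d.dt == DataType::F32 && d.has_sve; }, sve_fp32_stub },
    { "neon_fp32_activation", [](const SelectorData &d) { return d.dt == DataType::F32; }, neon_fp32_stub },
};

// Walk the table and return the first entry whose predicate accepts the query.
const Kernel *get_implementation(const SelectorData &data)
{
    for(const auto &k : available_kernels)
    {
        if(k.is_selected(data))
        {
            return &k;
        }
    }
    return nullptr;
}

int main()
{
    const Kernel *k = get_implementation({ DataType::F32, false });
    if(k != nullptr)
    {
        k->ukernel(); // prints "neon_fp32_activation" on a machine without SVE
    }
    return 0;
}
```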
diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..e51b5b3423
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp16Params = { static_cast<float16_t>(1e-7), 8 };
+} // namespace
+
+void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    fp_neon_activation_impl<float16_t, Fp16Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..2a3b8a0bfd
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp32Params = { static_cast<float>(1e-24), 4 };
+} // namespace
+void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    fp_neon_activation_impl<float, Fp32Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
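Both new wrappers instantiate the shared implementation with a `constexpr` parameter object passed by reference as a template argument (`fp_neon_activation_impl<float16_t, Fp16Params>`, `<float, Fp32Params>`). A minimal sketch of that pre-C++20 idiom follows, since `float` (or a struct by value) cannot be a non-type template parameter before C++20; `Params`, `kFp32Params` and `kernel_stub` are hypothetical names, and the snippet assumes C++17:

```cpp
#include <cstdio>

// Hypothetical stand-in for ActFpImplParams.
struct Params
{
    float delta;  // smallest safe increment, cf. ActFpImplParams::delta
    int   step_x; // vector width in elements, cf. ActFpImplParams::step_x
};

// The object needs static storage so that a reference to it can be used as a
// template argument (C++17 also permits internal linkage here).
constexpr Params kFp32Params = { 1e-24f, 4 };

template <typename T, const Params &P>
void kernel_stub()
{
    constexpr int step = P.step_x; // still a compile-time constant inside the template
    std::printf("step=%d delta=%g\n", step, static_cast<double>(static_cast<T>(P.delta)));
}

int main()
{
    kernel_stub<float, kFp32Params>(); // mirrors fp_neon_activation_impl<float, Fp32Params>
    return 0;
}
```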
diff --git a/src/cpu/kernels/activation/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/impl.h
index 54301d45ad..2dd239e3a1 100644
--- a/src/cpu/kernels/activation/neon/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -22,72 +22,73 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
 namespace arm_compute
 {
 namespace cpu
 {
-namespace
+/** Constant parameters needed by the activation implementation.
+ * These parameters differ for each floating type
+ *
+ * @note This are passed as a struct as C++ does not allow float as a template parameter until C++20
+ **/
+struct ActFpImplParams
 {
+    float delta;  /**< Minimum delta needed to avoid NaN on corner-cases of elementary functions */
+    int   step_x; /**< Window step at the x dimension */
+};
+
 #ifndef __aarch64__
 inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
 {
     auto int_in = vreinterpretq_u32_f32(in);
     return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
 }
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
+{
+    auto int_in = vreinterpretq_u16_f16(in);
+    return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
+}
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 #endif /* __aarch64__ */
-} // namespace
-void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+template <typename T, const ActFpImplParams &P>
+void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     /** SIMD vector tag type. */
-    using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
-
-    constexpr int window_step_x = 4;
+    using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+    constexpr int window_step_x  = P.step_x;
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
     const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
-
     // In case of non-aarch64, a small delta value is added to the input
     // to prevent NAN values caused by zeros in inputs to SQRT.
     // In case of aarh64, we call vsqrt directly, so we don't use delta.
 #ifndef __aarch64__
-    const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
+    const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType {});
 #endif /* __aarch64__ */
-    const auto const_1 = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
-    const auto const_0 = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
-    const auto const_6 = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
-    const auto const_3 = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
-    const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
-
+    const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType {});
+    const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+    const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
+    const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
+    const auto const_inv_6 = wrapper::vdup_n(static_cast<T>(0.166666667f), ExactTagType{});
     constexpr float soft_relu_thresh = 12.f;
-    const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float>(soft_relu_thresh), ExactTagType{});
-
-    const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{});
-    const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
-    const auto a = static_cast<float>(act_info.a());
-    const auto b = static_cast<float>(act_info.b());
+    const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<T>(soft_relu_thresh), ExactTagType{});
+    const auto va = wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{});
+    const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
+    const auto a = static_cast<T>(act_info.a());
+    const auto b = static_cast<T>(act_info.b());
     execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
-        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
-        wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
-
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
         // Compute S elements per iteration
         int x = window_start_x;
         for(; x <= (window_end_x - window_step_x); x += window_step_x)
@@ -150,12 +151,11 @@ void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLaye
             }
             wrapper::vstore(output_ptr + x, tmp);
         }
-
         // Compute left-over elements
         for(; x < window_end_x; ++x)
         {
-            const float in = *(reinterpret_cast<const float *>(input_ptr + x));
-            float tmp;
+            const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+            T tmp;
             switch(act)
             {
                 case ActivationLayerInfo::ActivationFunction::ABS:
@@ -165,22 +165,22 @@ void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLaye
                     tmp = std::abs(in);
                     break;
                 case ActivationLayerInfo::ActivationFunction::LINEAR:
                    tmp = a * in + b;
                     break;
                 case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                    tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
+                    tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
                     break;
                 case ActivationLayerInfo::ActivationFunction::RELU:
-                    tmp = std::max<float>(static_cast<float>(0), in);
+                    tmp = std::max<T>(static_cast<T>(0), in);
                     break;
                 case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                    tmp = std::min<float>(a, std::max(static_cast<float>(0), in));
+                    tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
                     break;
                 case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                    tmp = std::min<float>(a, std::max<float>(b, in));
+                    tmp = std::min<T>(a, std::max<T>(b, in));
                     break;
                 case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
                     tmp = (in > 0) ? in : a * in;
                     break;
                 case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                    tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float>(1) + std::exp(in));
+                    tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
                     break;
                 case ActivationLayerInfo::ActivationFunction::ELU:
                     tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
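The `#ifndef __aarch64__` machinery above exists for `SQRT`: without a vector `vsqrt`, square root is computed as `vinv(vinvsqrt(x))`, and an input lane of exactly zero would produce Inf and then NaN. A scalar sketch of the assumed semantics of the `delta`/`mask_float_vector` guard (illustrative names, plain C++):

```cpp
#include <cmath>
#include <cstdio>

// Scalar analogue of the non-aarch64 SQRT path: add a tiny delta only to
// lanes that are exactly zero, compute 1/rsqrt, then force those lanes back
// to zero afterwards.
float sqrt_via_rsqrt(float x, float delta)
{
    const bool  was_zero = (x == 0.0f);
    const float guarded  = x + (was_zero ? delta : 0.0f); // mask_float_vector(delta, bitmask)
    float result = 1.0f / (1.0f / std::sqrt(guarded));    // vinv(vinvsqrt(...))
    if(was_zero)
    {
        result = 0.0f; // mask_float_vector(tmp, vnot(bitmask))
    }
    return result;
}

int main()
{
    std::printf("%f %f\n", sqrt_via_rsqrt(4.0f, 1e-24f), sqrt_via_rsqrt(0.0f, 1e-24f)); // 2.0 0.0
    return 0;
}
```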
diff --git a/src/cpu/kernels/activation/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index a1217435b6..62e329e691 100644
--- a/src/cpu/kernels/activation/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
 {
 namespace cpu
 {
-void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     constexpr int window_step_x = 16;
     const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
index 8b40bf8e72..4dca1ba794 100644
--- a/src/cpu/kernels/activation/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -35,7 +35,7 @@ namespace arm_compute
 {
 namespace cpu
 {
-void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     constexpr int window_step_x = 16;
     const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
index 54b41820f2..865b9f114e 100644
--- a/src/cpu/kernels/activation/neon/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
@@ -37,7 +37,7 @@ namespace arm_compute
 {
 namespace cpu
 {
-void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     constexpr int window_step_x = 8;
     const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
index 5e76e82c52..47d9fabb55 100644
--- a/src/cpu/kernels/activation/sve/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
 {
 namespace cpu
 {
-void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp
index cb9f82eb39..1685b0f669 100644
--- a/src/cpu/kernels/activation/sve/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
 {
 namespace cpu
 {
-void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
index 69fffd96c5..3b99c0f120 100644
--- a/src/cpu/kernels/activation/sve/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
 {
 namespace cpu
 {
-void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
index 53ee515ff9..24415145d3 100644
--- a/src/cpu/kernels/activation/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -37,7 +37,7 @@ namespace arm_compute
 {
 namespace cpu
 {
-void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
index ac549770a2..0eecfa618f 100644
--- a/src/cpu/kernels/activation/sve/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
@@ -38,7 +38,7 @@ namespace arm_compute
 {
 namespace cpu
 {
-void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h
index 409d025db0..bf9aa0f373 100644
--- a/src/cpu/kernels/activation/list.h
+++ b/src/cpu/kernels/activation/list.h
@@ -31,16 +31,16 @@ namespace cpu
 #define DECLARE_ACTIVATION_KERNEL(func_name) \
     void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
-DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_sve_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp32_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp32_activation);
 
 #undef DECLARE_ACTIVATION_KERNEL
 } // namespace cpu
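For reference, each `DECLARE_ACTIVATION_KERNEL(...)` line above expands to a plain function declaration with the shared activation signature defined by the macro, e.g.:

```cpp
// DECLARE_ACTIVATION_KERNEL(neon_fp32_activation); expands to:
void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window);
```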
diff --git a/src/cpu/kernels/activation/neon/fp16.cpp b/src/cpu/kernels/activation/neon/fp16.cpp
deleted file mode 100644
index 6f2d5d8533..0000000000
--- a/src/cpu/kernels/activation/neon/fp16.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/NEMath.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-#ifndef __aarch64__
-inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
-{
-    auto int_in = vreinterpretq_u16_f16(in);
-    return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
-}
-#endif /* __aarch64__ */
-} // namespace
-
-void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
-    /** SIMD vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
-    const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
-    constexpr int window_step_x  = 8;
-    const auto    window_start_x = static_cast<int>(window.x().start());
-    const auto    window_end_x   = static_cast<int>(window.x().end());
-
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(src, win_collapsed);
-    Iterator output(dst, win_collapsed);
-
-    // In case of non-aarch64, a small delta value is added to the input
-    // to prevent NAN values caused by zeros in inputs to SQRT.
-    // In case of aarh64, we call vsqrt directly, so we don't use delta.
-#ifndef __aarch64__
-    const auto delta = wrapper::vdup_n(static_cast<float16_t>((1e-7), ExactTagType {}));
-#endif /* __aarch64__ */
-
-    const auto const_1     = wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
-    const auto const_0     = wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
-    const auto const_6     = wrapper::vdup_n(static_cast<float16_t>(6.f), ExactTagType{});
-    const auto const_3     = wrapper::vdup_n(static_cast<float16_t>(3.f), ExactTagType{});
-    const auto const_inv_6 = wrapper::vdup_n(static_cast<float16_t>(0.166666667f), ExactTagType{});
-
-    constexpr float soft_relu_thresh  = 12.f;
-    const auto      vsoft_relu_thresh = wrapper::vdup_n(static_cast<float16_t>(soft_relu_thresh), ExactTagType{});
-
-    const auto va = wrapper::vdup_n(static_cast<float16_t>(act_info.a()), ExactTagType{});
-    const auto vb = wrapper::vdup_n(static_cast<float16_t>(act_info.b()), ExactTagType{});
-    const auto a  = static_cast<float16_t>(act_info.a());
-    const auto b  = static_cast<float16_t>(act_info.b());
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
-        wrapper::traits::neon_bitvector_t<float16_t, wrapper::traits::BitWidth::W128> tmp;
-
-        // Compute S elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-            switch(act)
-            {
-                case ActivationLayerInfo::ActivationFunction::ABS:
-                    tmp = wrapper::vabs(vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LINEAR:
-                    tmp = wrapper::vmla(vb, va, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                    tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::RELU:
-                    tmp = wrapper::vmax(const_0, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                    tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                    tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-                    tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                    tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::ELU:
-                    tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQRT:
-#ifdef __aarch64__
-                    tmp = wrapper::vsqrt(vin);
-#else /* __aarch64__ */
-                {
-                    const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{}));
-                    tmp                = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
-                    tmp                = mask_float_vector(tmp, wrapper::vnot(bitmask));
-                }
-#endif /* __aarch64__ */
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQUARE:
-                    tmp = wrapper::vmul(vin, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::TANH:
-                    tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::IDENTITY:
-                    tmp = vin;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
-                    tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-            wrapper::vstore(output_ptr + x, tmp);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const float16_t in = *(reinterpret_cast<const float16_t *>(input_ptr + x));
-            float16_t       tmp;
-            switch(act)
-            {
-                case ActivationLayerInfo::ActivationFunction::ABS:
-                    tmp = std::abs(in);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LINEAR:
-                    tmp = a * in + b;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                    tmp = static_cast<float16_t>(1) / (static_cast<float16_t>(1) + std::exp(-in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::RELU:
-                    tmp = std::max<float16_t>(static_cast<float16_t>(0), in);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                    tmp = std::min<float16_t>(a, std::max(static_cast<float16_t>(0), in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                    tmp = std::min<float16_t>(a, std::max<float16_t>(b, in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-                    tmp = (in > 0) ? in : a * in;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                    tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float16_t>(1) + std::exp(in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::ELU:
-                    tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQRT:
-                    tmp = std::sqrt(in);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQUARE:
-                    tmp = in * in;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::TANH:
-                    tmp = a * std::tanh(b * in);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::IDENTITY:
-                    tmp = in;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
-                    tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
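All of the kernels touched by this commit share the same traversal: a vectorised body that consumes `window_step_x` elements per iteration, followed by a scalar loop for the left-over tail. A self-contained, intrinsics-free sketch of that pattern (RELU used as the per-lane operation; all names are illustrative, not the library's API):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    constexpr int window_step_x = 4; // e.g. 4 floats per 128-bit NEON vector
    std::vector<float> src = { -2.f, -1.f, 0.f, 1.f, 2.f, 3.f };
    std::vector<float> dst(src.size());
    const int window_start_x = 0;
    const int window_end_x   = static_cast<int>(src.size());

    int x = window_start_x;
    // Compute S elements per iteration (the vectorised body in the real kernels)
    for(; x <= window_end_x - window_step_x; x += window_step_x)
    {
        for(int i = 0; i < window_step_x; ++i)
        {
            dst[x + i] = std::max(0.f, src[x + i]); // lane-wise RELU
        }
    }
    // Compute left-over elements one at a time
    for(; x < window_end_x; ++x)
    {
        dst[x] = std::max(0.f, src[x]);
    }

    for(float v : dst)
    {
        std::printf("%g ", v); // 0 0 0 1 2 3
    }
    std::printf("\n");
    return 0;
}
```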