author     Dana Zlotnik <dana.zlotnik@arm.com>  2021-11-25 09:58:27 +0200
committer  Dana Zlotnik <dana.zlotnik@arm.com>  2021-12-20 12:56:22 +0000
commit     3229171402dcb9a63d63380865ba18477b81ff89 (patch)
tree       bc9da509a853307184c4d55c7874376adfdcf13d
parent     4d44ac8685662984386b65869c3ed6af1144a419 (diff)
Decouple CpuActivationKernel

1- Data types were already decoupled. This commit arranges the folder
structure of the activation kernel.
2- Refactor the NEON CpuActivationKernel for the floating-point cases.

Resolves COMPMID-4636

Change-Id: Ia4527244c84260dce1dd1d4bd4a9e3cfe2486d85
Signed-off-by: Dana Zlotnik <dana.zlotnik@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6739
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
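
The resulting layout, as reflected in the diffstat below, groups every data-type specialisation under an ISA-specific subfolder of generic/, and the SVE2-only quantized kernels move out of sve/ into their own sve2/ folder:

    src/cpu/kernels/activation/
    ├── generic/
    │   ├── neon/   fp16.cpp, fp32.cpp, impl.h, qasymm8.cpp, qasymm8_signed.cpp, qsymm16.cpp
    │   ├── sve/    fp16.cpp, fp32.cpp
    │   └── sve2/   qasymm8.cpp, qasymm8_signed.cpp, qsymm16.cpp
    └── list.h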
-rw-r--r--  Android.bp                                        | 20
-rw-r--r--  filelist.json                                     | 20
-rw-r--r--  src/cpu/kernels/CpuActivationKernel.cpp           | 30
-rw-r--r--  src/cpu/kernels/activation/generic/neon/fp16.cpp  | 43
-rw-r--r--  src/cpu/kernels/activation/generic/neon/fp32.cpp  | 39
-rw-r--r--  src/cpu/kernels/activation/generic/neon/impl.h (renamed from src/cpu/kernels/activation/neon/fp32.cpp) | 86
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qasymm8.cpp (renamed from src/cpu/kernels/activation/neon/qasymm8.cpp) | 2
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp (renamed from src/cpu/kernels/activation/neon/qasymm8_signed.cpp) | 2
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qsymm16.cpp (renamed from src/cpu/kernels/activation/neon/qsymm16.cpp) | 2
-rw-r--r--  src/cpu/kernels/activation/generic/sve/fp16.cpp (renamed from src/cpu/kernels/activation/sve/fp16.cpp) | 2
-rw-r--r--  src/cpu/kernels/activation/generic/sve/fp32.cpp (renamed from src/cpu/kernels/activation/sve/fp32.cpp) | 2
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qasymm8.cpp (renamed from src/cpu/kernels/activation/sve/qasymm8.cpp) | 2
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp (renamed from src/cpu/kernels/activation/sve/qasymm8_signed.cpp) | 2
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qsymm16.cpp (renamed from src/cpu/kernels/activation/sve/qsymm16.cpp) | 2
-rw-r--r--  src/cpu/kernels/activation/list.h                 | 20
-rw-r--r--  src/cpu/kernels/activation/neon/fp16.cpp          | 217
16 files changed, 176 insertions(+), 315 deletions(-)
diff --git a/Android.bp b/Android.bp
index 5727706c07..5654b840fe 100644
--- a/Android.bp
+++ b/Android.bp
@@ -429,16 +429,16 @@ cc_library_static {
"src/cpu/kernels/CpuTransposeKernel.cpp",
"src/cpu/kernels/CpuWeightsReshapeKernel.cpp",
"src/cpu/kernels/CpuWinogradConv2dKernel.cpp",
- "src/cpu/kernels/activation/neon/fp16.cpp",
- "src/cpu/kernels/activation/neon/fp32.cpp",
- "src/cpu/kernels/activation/neon/qasymm8.cpp",
- "src/cpu/kernels/activation/neon/qasymm8_signed.cpp",
- "src/cpu/kernels/activation/neon/qsymm16.cpp",
- "src/cpu/kernels/activation/sve/fp16.cpp",
- "src/cpu/kernels/activation/sve/fp32.cpp",
- "src/cpu/kernels/activation/sve/qasymm8.cpp",
- "src/cpu/kernels/activation/sve/qasymm8_signed.cpp",
- "src/cpu/kernels/activation/sve/qsymm16.cpp",
+ "src/cpu/kernels/activation/generic/neon/fp16.cpp",
+ "src/cpu/kernels/activation/generic/neon/fp32.cpp",
+ "src/cpu/kernels/activation/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/activation/generic/neon/qsymm16.cpp",
+ "src/cpu/kernels/activation/generic/sve/fp16.cpp",
+ "src/cpu/kernels/activation/generic/sve/fp32.cpp",
+ "src/cpu/kernels/activation/generic/sve2/qasymm8.cpp",
+ "src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp",
+ "src/cpu/kernels/activation/generic/sve2/qsymm16.cpp",
"src/cpu/kernels/add/generic/neon/fp16.cpp",
"src/cpu/kernels/add/generic/neon/fp32.cpp",
"src/cpu/kernels/add/generic/neon/impl.cpp",
diff --git a/filelist.json b/filelist.json
index 428ad7d2cb..d7847480a4 100644
--- a/filelist.json
+++ b/filelist.json
@@ -845,20 +845,20 @@
"src/cpu/operators/CpuActivation.cpp",
"src/cpu/kernels/CpuActivationKernel.cpp",
"src/runtime/NEON/functions/NEActivationLayer.cpp",
- "src/cpu/kernels/activation/neon/qasymm8.cpp",
- "src/cpu/kernels/activation/neon/qasymm8_signed.cpp",
- "src/cpu/kernels/activation/neon/qsymm16.cpp"
+ "src/cpu/kernels/activation/generic/neon/qasymm8.cpp",
+ "src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp",
+ "src/cpu/kernels/activation/generic/neon/qsymm16.cpp"
],
"neon": {
- "fp16": [ "src/cpu/kernels/activation/neon/fp16.cpp" ],
- "fp32": [ "src/cpu/kernels/activation/neon/fp32.cpp" ]
+ "fp16": [ "src/cpu/kernels/activation/generic/neon/fp16.cpp" ],
+ "fp32": [ "src/cpu/kernels/activation/generic/neon/fp32.cpp" ]
},
"sve": {
- "fp16": [ "src/cpu/kernels/activation/sve/fp16.cpp" ],
- "fp32": [ "src/cpu/kernels/activation/sve/fp32.cpp" ],
- "qasymm8": [ "src/cpu/kernels/activation/sve/qasymm8.cpp" ],
- "qasymm8_signed": [ "src/cpu/kernels/activation/sve/qasymm8_signed.cpp" ],
- "qsymm16": [ "src/cpu/kernels/activation/sve/qsymm16.cpp" ]
+ "fp16": [ "src/cpu/kernels/activation/generic/sve/fp16.cpp" ],
+ "fp32": [ "src/cpu/kernels/activation/generic/sve/fp32.cpp" ],
+ "qasymm8": [ "src/cpu/kernels/activation/generic/sve2/qasymm8.cpp" ],
+ "qasymm8_signed": [ "src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp" ],
+ "qsymm16": [ "src/cpu/kernels/activation/generic/sve2/qsymm16.cpp" ]
}
}
},
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index aed73d1fec..3af379d8af 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -65,57 +65,57 @@ static const ActivationKernel available_kernels[] =
{
"sve_fp16_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation)
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)
},
{
"sve_fp32_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
"neon_fp16_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation)
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)
},
{
"neon_fp32_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
"sve_qu8_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation)
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)
},
{
"sve_qs8_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation)
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)
},
{
"sve_qs16_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16 && data.ci.has_sve2(); },
- REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation)
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
{
"neon_qu8_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation)
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)
},
{
"neon_qs8_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation)
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)
},
{
"neon_qs16_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)
},
};
@@ -233,18 +233,14 @@ Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *
size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
ARM_COMPUTE_UNUSED(thread_count);
- // Tuning results that gave optimized results in performance investigation
- if (platform.get_cpu_model() == CPUModel::A73 )
+ // Tuning results that gave optimized results in performance investigation
+ if(platform.get_cpu_model() == CPUModel::A73)
{
return 10240;
}
- else if (platform.get_cpu_model() == CPUModel::A76)
- {
- return 9216;
- }
else
{
- return ICPPKernel::default_mws;
+ return 9216;
}
}
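
For context on the hunks above: available_kernels is an ordered table of { name, selector predicate, kernel pointer } entries, and the first entry whose predicate matches the tensor's data type and the CPU's features wins, which is why the SVE/SVE2 rows precede the NEON fallbacks. A minimal sketch of the same dispatch pattern, using hypothetical stand-in types rather than the library's own:

    #include <cstdio>

    enum class DataType { F16, F32 };
    struct SelectorData { DataType dt; bool has_sve; };

    using KernelFn = void (*)();
    void sve_fp32_kernel()  { std::puts("sve_fp32"); }
    void neon_fp32_kernel() { std::puts("neon_fp32"); }

    struct KernelEntry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        KernelFn    ukernel;
    };

    // Ordered by preference: the first matching entry wins.
    static const KernelEntry available_kernels[] =
    {
        { "sve_fp32", [](const SelectorData & d) { return d.dt == DataType::F32 && d.has_sve; }, &sve_fp32_kernel },
        { "neon_fp32", [](const SelectorData & d) { return d.dt == DataType::F32; }, &neon_fp32_kernel },
    };

    const KernelEntry *select_kernel(const SelectorData &data)
    {
        for(const auto &k : available_kernels)
        {
            if(k.is_selected(data))
            {
                return &k;
            }
        }
        return nullptr;
    }

    int main()
    {
        const KernelEntry *k = select_kernel({ DataType::F32, /* has_sve = */ false });
        if(k != nullptr)
        {
            k->ukernel(); // falls through to "neon_fp32"
        }
        return 0;
    }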
diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp
new file mode 100644
index 0000000000..e51b5b3423
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp16Params = { static_cast<float16_t>(1e-7), 8 };
+} // namespace
+
+void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ fp_neon_activation_impl<float16_t, Fp16Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp
new file mode 100644
index 0000000000..2a3b8a0bfd
--- /dev/null
+++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/kernels/activation/generic/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+constexpr ActFpImplParams Fp32Params = { static_cast<float>(1e-24), 4 };
+} // namespace
+void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+ fp_neon_activation_impl<float, Fp32Params>(src, dst, act_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
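
The two new translation units above differ only in their ActFpImplParams. The step comes from the lane count of a 128-bit NEON register (four 32-bit lanes vs eight 16-bit lanes), and the delta must stay representable in the element type: since float16_t bottoms out near 6e-8, 1e-7 is used instead of fp32's 1e-24. A trivial check of the lane arithmetic:

    #include <cstdio>

    int main()
    {
        // Lane count of a 128-bit NEON register = 128 / bits-per-element.
        std::printf("fp32 lanes: %d\n", 128 / 32); // 4 -> Fp32Params.step_x
        std::printf("fp16 lanes: %d\n", 128 / 16); // 8 -> Fp16Params.step_x
        return 0;
    }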
diff --git a/src/cpu/kernels/activation/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/impl.h
index 54301d45ad..2dd239e3a1 100644
--- a/src/cpu/kernels/activation/neon/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -22,72 +22,73 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
namespace arm_compute
{
namespace cpu
{
-namespace
+/** Constant parameters needed by the activation implementation.
+ * These parameters differ for each floating-point type
+ *
+ * @note These are passed in a struct because C++ does not allow float as a non-type template parameter until C++20
+ **/
+struct ActFpImplParams
{
+ float delta; /**< Minimum delta needed to avoid NaN on corner-cases of elementary functions */
+ int step_x; /**< Window step at the x dimension */
+};
+
#ifndef __aarch64__
inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
{
auto int_in = vreinterpretq_u32_f32(in);
return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
}
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
+{
+ auto int_in = vreinterpretq_u16_f16(in);
+ return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
+}
+#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#endif /* __aarch64__ */
-} // namespace
-void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+template <typename T, const ActFpImplParams &P>
+void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
/** SIMD vector tag type. */
- using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 4;
+ using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+ constexpr int window_step_x = P.step_x;
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
-
// In case of non-aarch64, a small delta value is added to the input
// to prevent NAN values caused by zeros in inputs to SQRT.
// In case of aarch64, we call vsqrt directly, so we don't use delta.
#ifndef __aarch64__
- const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
+ const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType {});
#endif /* __aarch64__ */
- const auto const_1 = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
- const auto const_0 = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
- const auto const_6 = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
- const auto const_3 = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
- const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
-
+ const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType {});
+ const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
+ const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
+ const auto const_inv_6 = wrapper::vdup_n(static_cast<T>(0.166666667f), ExactTagType{});
constexpr float soft_relu_thresh = 12.f;
- const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float>(soft_relu_thresh), ExactTagType{});
-
- const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
- const auto a = static_cast<float>(act_info.a());
- const auto b = static_cast<float>(act_info.b());
+ const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<T>(soft_relu_thresh), ExactTagType{});
+ const auto va = wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{});
+ const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
+ const auto a = static_cast<T>(act_info.a());
+ const auto b = static_cast<T>(act_info.b());
execute_window_loop(win_collapsed, [&](const Coordinates &)
{
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
-
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
// Compute S elements per iteration
int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
@@ -150,12 +151,11 @@ void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLaye
}
wrapper::vstore(output_ptr + x, tmp);
}
-
// Compute left-over elements
for(; x < window_end_x; ++x)
{
- const float in = *(reinterpret_cast<const float *>(input_ptr + x));
- float tmp;
+ const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+ T tmp;
switch(act)
{
case ActivationLayerInfo::ActivationFunction::ABS:
@@ -165,22 +165,22 @@ void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLaye
tmp = a * in + b;
break;
case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
+ tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
break;
case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<float>(static_cast<float>(0), in);
+ tmp = std::max<T>(static_cast<T>(0), in);
break;
case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<float>(a, std::max(static_cast<float>(0), in));
+ tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
break;
case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<float>(a, std::max<float>(b, in));
+ tmp = std::min<T>(a, std::max<T>(b, in));
break;
case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
tmp = (in > 0) ? in : a * in;
break;
case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float>(1) + std::exp(in));
+ tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
break;
case ActivationLayerInfo::ActivationFunction::ELU:
tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
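
The central idiom in impl.h is binding the per-type constants to a reference template parameter, because C++ before C++20 rejects floating-point non-type template parameters. A self-contained sketch of the idiom with hypothetical names (FpParams, describe_impl), not the library's code:

    #include <cstdio>

    // A constexpr struct passed by reference stands in for the forbidden
    // float template parameter.
    struct FpParams
    {
        float delta;  // type-specific epsilon added before SQRT on armv7
        int   step_x; // elements processed per vector iteration
    };

    constexpr FpParams kFp32Params = { 1e-24f, 4 };
    constexpr FpParams kFp16Params = { 1e-7f, 8 };

    template <typename T, const FpParams &P>
    void describe_impl()
    {
        // Members of P remain compile-time constants inside the template.
        constexpr int step = P.step_x;
        std::printf("step_x=%d delta=%g\n", step, static_cast<double>(P.delta));
    }

    int main()
    {
        describe_impl<float, kFp32Params>(); // step_x=4 delta=1e-24
        describe_impl<float, kFp16Params>(); // mirrors the fp16 instantiation
        return 0;
    }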
diff --git a/src/cpu/kernels/activation/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index a1217435b6..62e329e691 100644
--- a/src/cpu/kernels/activation/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
index 8b40bf8e72..4dca1ba794 100644
--- a/src/cpu/kernels/activation/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -35,7 +35,7 @@ namespace arm_compute
{
namespace cpu
{
-void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
index 54b41820f2..865b9f114e 100644
--- a/src/cpu/kernels/activation/neon/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
@@ -37,7 +37,7 @@ namespace arm_compute
{
namespace cpu
{
-void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
constexpr int window_step_x = 8;
const auto window_start_x = static_cast<int>(window.x().start());
diff --git a/src/cpu/kernels/activation/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
index 5e76e82c52..47d9fabb55 100644
--- a/src/cpu/kernels/activation/sve/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp
index cb9f82eb39..1685b0f669 100644
--- a/src/cpu/kernels/activation/sve/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
index 69fffd96c5..3b99c0f120 100644
--- a/src/cpu/kernels/activation/sve/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -36,7 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
index 53ee515ff9..24415145d3 100644
--- a/src/cpu/kernels/activation/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -37,7 +37,7 @@ namespace arm_compute
{
namespace cpu
{
-void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/sve/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
index ac549770a2..0eecfa618f 100644
--- a/src/cpu/kernels/activation/sve/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
@@ -38,7 +38,7 @@ namespace arm_compute
{
namespace cpu
{
-void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h
index 409d025db0..bf9aa0f373 100644
--- a/src/cpu/kernels/activation/list.h
+++ b/src/cpu/kernels/activation/list.h
@@ -31,16 +31,16 @@ namespace cpu
#define DECLARE_ACTIVATION_KERNEL(func_name) \
void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_sve_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation);
+DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(sve_fp32_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp16_activation);
+DECLARE_ACTIVATION_KERNEL(neon_fp32_activation);
#undef DECLARE_ACTIVATION_KERNEL
} // namespace cpu
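
These declarations pair with the REGISTER_* wrappers seen in CpuActivationKernel.cpp; by convention such macros expand to the function's address when the backend is compiled in, and to nullptr otherwise, so disabled entries simply never get selected. A hedged sketch of that convention (an assumption here, as the real macro definitions live outside this patch):

    #if defined(ARM_COMPUTE_ENABLE_SVE2)
    #define REGISTER_QASYMM8_SVE2(func_name) &(func_name)
    #else
    #define REGISTER_QASYMM8_SVE2(func_name) nullptr
    #endif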
diff --git a/src/cpu/kernels/activation/neon/fp16.cpp b/src/cpu/kernels/activation/neon/fp16.cpp
deleted file mode 100644
index 6f2d5d8533..0000000000
--- a/src/cpu/kernels/activation/neon/fp16.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/NEMath.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-#ifndef __aarch64__
-inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
-{
- auto int_in = vreinterpretq_u16_f16(in);
- return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
-}
-#endif /* __aarch64__ */
-} // namespace
-
-void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- constexpr int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- // In case of non-aarch64, a small delta value is added to the input
- // to prevent NAN values caused by zeros in inputs to SQRT.
- // In case of aarh64, we call vsqrt directly, so we don't use delta.
-#ifndef __aarch64__
- const auto delta = wrapper::vdup_n(static_cast<float16_t>((1e-7), ExactTagType {}));
-#endif /* __aarch64__ */
-
- const auto const_1 = wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
- const auto const_0 = wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
- const auto const_6 = wrapper::vdup_n(static_cast<float16_t>(6.f), ExactTagType{});
- const auto const_3 = wrapper::vdup_n(static_cast<float16_t>(3.f), ExactTagType{});
- const auto const_inv_6 = wrapper::vdup_n(static_cast<float16_t>(0.166666667f), ExactTagType{});
-
- constexpr float soft_relu_thresh = 12.f;
- const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float16_t>(soft_relu_thresh), ExactTagType{});
-
- const auto va = wrapper::vdup_n(static_cast<float16_t>(act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<float16_t>(act_info.b()), ExactTagType{});
- const auto a = static_cast<float16_t>(act_info.a());
- const auto b = static_cast<float16_t>(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<float16_t, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
-#ifdef __aarch64__
- tmp = wrapper::vsqrt(vin);
-#else /* __aarch64__ */
- {
- const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{}));
- tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
- tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
- }
-#endif /* __aarch64__ */
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float16_t in = *(reinterpret_cast<const float16_t *>(input_ptr + x));
- float16_t tmp;
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<float16_t>(1) / (static_cast<float16_t>(1) + std::exp(-in));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<float16_t>(static_cast<float16_t>(0), in);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<float16_t>(a, std::max(static_cast<float16_t>(0), in));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<float16_t>(a, std::max<float16_t>(b, in));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float16_t>(1) + std::exp(in));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = in;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */