diff options
-rw-r--r-- | arm_compute/core/NEON/NEMath.inl | 56 | ||||
-rw-r--r-- | docs/00_introduction.dox | 1 | ||||
-rwxr-xr-x | opencl-1.2-stubs/opencl_stubs.c | 1 | ||||
-rw-r--r-- | src/core/NEON/kernels/NEHarrisCornersKernel.cpp | 10 | ||||
-rw-r--r-- | src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 4 | ||||
-rw-r--r-- | support/ToolchainSupport.h | 14 |
6 files changed, 51 insertions, 35 deletions
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index 84154020a5..61d25d115c 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -173,35 +173,7 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Exponent polynomial coefficients */ -const std::array<float16x8_t, 8> exp_tab_f16 = -{ - { - vdupq_n_f16(1.f), - vdupq_n_f16(0.0416598916054f), - vdupq_n_f16(0.500000596046f), - vdupq_n_f16(0.0014122662833f), - vdupq_n_f16(1.00000011921f), - vdupq_n_f16(0.00833693705499f), - vdupq_n_f16(0.166665703058f), - vdupq_n_f16(0.000195780929062f), - } -}; - /** Logarithm polynomial coefficients */ -const std::array<float16x8_t, 8> log_tab_f16 = -{ - { - vdupq_n_f16(-2.29561495781f), - vdupq_n_f16(-2.47071170807f), - vdupq_n_f16(-5.68692588806f), - vdupq_n_f16(-0.165253549814f), - vdupq_n_f16(5.17591238022f), - vdupq_n_f16(0.844007015228f), - vdupq_n_f16(4.58445882797f), - vdupq_n_f16(0.0141278216615f), - } -}; - #ifndef DOXYGEN_SKIP_THIS inline float16x4_t vinvsqrt_f16(float16x4_t x) { @@ -264,6 +236,20 @@ inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t inline float16x8_t vexpq_f16(float16x8_t x) { + static const std::array<float16x8_t, 8> exp_tab_f16 = + { + { + vdupq_n_f16(1.f), + vdupq_n_f16(0.0416598916054f), + vdupq_n_f16(0.500000596046f), + vdupq_n_f16(0.0014122662833f), + vdupq_n_f16(1.00000011921f), + vdupq_n_f16(0.00833693705499f), + vdupq_n_f16(0.166665703058f), + vdupq_n_f16(0.000195780929062f), + } + }; + static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); // ln(2) static const float16x8_t CONST_INV_LN2 = vdupq_n_f16(1.4426950408f); // 1/ln(2) static const float16x8_t CONST_0 = vdupq_n_f16(0.f); @@ -285,6 +271,20 @@ inline float16x8_t vexpq_f16(float16x8_t x) inline float16x8_t vlogq_f16(float16x8_t x) { + static const std::array<float16x8_t, 8> log_tab_f16 = + { + { + vdupq_n_f16(-2.29561495781f), + vdupq_n_f16(-2.47071170807f), + vdupq_n_f16(-5.68692588806f), + vdupq_n_f16(-0.165253549814f), + vdupq_n_f16(5.17591238022f), + vdupq_n_f16(0.844007015228f), + vdupq_n_f16(4.58445882797f), + vdupq_n_f16(0.0141278216615f), + } + }; + static const int16x8_t CONST_127 = vdupq_n_s16(127); // 127 static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); // ln(2) diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index f2ad539aeb..e221f7dc00 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -849,6 +849,7 @@ Below is a list of the common parameters among the graph examples : For Android, the library was successfully built and tested using Google's standalone toolchains: - clang++ from NDK r17b for armv7a - clang++ from NDK r17b for arm64-v8a + - clang++ from NDK r18-beta1 for arm64-v8.2-a with FP16 support Here is a guide to <a href="https://developer.android.com/ndk/guides/standalone_toolchain.html">create your Android standalone toolchains from the NDK</a> diff --git a/opencl-1.2-stubs/opencl_stubs.c b/opencl-1.2-stubs/opencl_stubs.c index a76eaa0bf9..3bfe3a0016 100755 --- a/opencl-1.2-stubs/opencl_stubs.c +++ b/opencl-1.2-stubs/opencl_stubs.c @@ -1,3 +1,4 @@ +#define CL_TARGET_OPENCL_VERSION 200 #include <CL/cl.h> #include <stdio.h> diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp index 14fa1b492f..5e1c216b65 100644 --- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp +++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,10 +41,6 @@ using namespace arm_compute; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class arm_compute::NEHarrisScoreFP16Kernel<3>; -template class arm_compute::NEHarrisScoreFP16Kernel<5>; -template class arm_compute::NEHarrisScoreFP16Kernel<7>; - namespace fp16 { inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh) @@ -361,6 +357,10 @@ void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const INEKernel::configure(win); } +template class arm_compute::NEHarrisScoreFP16Kernel<3>; +template class arm_compute::NEHarrisScoreFP16Kernel<5>; +template class arm_compute::NEHarrisScoreFP16Kernel<7>; + #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ template class arm_compute::NEHarrisScoreKernel<3>; diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index 4041b623b1..484e58b79b 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -392,7 +392,7 @@ void logits_1d_max(const ITensor &in, ITensor &out, const Window &window) const auto out_ptr = reinterpret_cast<T *>(output.ptr()); // Init max value - auto vec_max = vdup_n<vec_16_byte_t<T>>(std::numeric_limits<T>::lowest()); + auto vec_max = vdup_n<vec_16_byte_t<T>>(support::cpp11::lowest<T>()); // Loop over input row for(const T *it = in_ptr; it < (in_ptr + input_width); it += vec_size_of(vec_max)) @@ -694,7 +694,7 @@ void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const { auto vec_elements = vld<vec_16_byte_t<T>>(in_ptr + i); vec_elements = vsub(vec_elements, vec_max); - vec_elements = vexp(vmul_n(vec_elements, beta)); + vec_elements = vexp(vmul_n(vec_elements, static_cast<T>(beta))); vec_sum = vadd(vec_sum, vec_elements); vst(tmp_ptr + i, vec_elements); } diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h index ece966704f..7d02e67ec6 100644 --- a/support/ToolchainSupport.h +++ b/support/ToolchainSupport.h @@ -315,6 +315,20 @@ inline void *align(std::size_t alignment, std::size_t size, void *&ptr, std::siz return ptr = reinterpret_cast<void *>(aligned); } +// std::numeric_limits<T>::lowest +template <typename T> +inline T lowest() +{ + return std::numeric_limits<T>::lowest(); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline __fp16 lowest<__fp16>() +{ + return std::numeric_limits<half_float::half>::lowest(); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ // std::isfinite template <typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value>::type> |