COMPMID-1499: Fixed issues to build for FP16 on Android

Change-Id: I7cd15e9115b5c6f544005528d69061751286be11
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/143708
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
author: Anthony Barbier <anthony.barbier@arm.com> 2018-08-10 17:36:36 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:54:54 +0000
commit: 3a6163ed0c2d0ab4cac0456e8f66c704c6ad10c2 (patch)
tree: f2e34b47fb0b8bf1fac18c337a67c0145def7b7b
parent: e3f1bd10e71075f7d8b406698809666e1eb4e7ae (diff)
download: ComputeLibrary-3a6163ed0c2d0ab4cac0456e8f66c704c6ad10c2.tar.gz
6 files changed, 51 insertions, 35 deletions
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 84154020a5..61d25d115c 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -173,35 +173,7 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Exponent polynomial coefficients */
-const std::array<float16x8_t, 8> exp_tab_f16 =
-{
-    {
-        vdupq_n_f16(1.f),
-        vdupq_n_f16(0.0416598916054f),
-        vdupq_n_f16(0.500000596046f),
-        vdupq_n_f16(0.0014122662833f),
-        vdupq_n_f16(1.00000011921f),
-        vdupq_n_f16(0.00833693705499f),
-        vdupq_n_f16(0.166665703058f),
-        vdupq_n_f16(0.000195780929062f),
-    }
-};
-
 /** Logarithm polynomial coefficients */
-const std::array<float16x8_t, 8> log_tab_f16 =
-{
-    {
-        vdupq_n_f16(-2.29561495781f),
-        vdupq_n_f16(-2.47071170807f),
-        vdupq_n_f16(-5.68692588806f),
-        vdupq_n_f16(-0.165253549814f),
-        vdupq_n_f16(5.17591238022f),
-        vdupq_n_f16(0.844007015228f),
-        vdupq_n_f16(4.58445882797f),
-        vdupq_n_f16(0.0141278216615f),
-    }
-};
-
 #ifndef DOXYGEN_SKIP_THIS
 inline float16x4_t vinvsqrt_f16(float16x4_t x)
 {
@@ -264,6 +236,20 @@ inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t
 
 inline float16x8_t vexpq_f16(float16x8_t x)
 {
+    static const std::array<float16x8_t, 8> exp_tab_f16 =
+    {
+        {
+            vdupq_n_f16(1.f),
+            vdupq_n_f16(0.0416598916054f),
+            vdupq_n_f16(0.500000596046f),
+            vdupq_n_f16(0.0014122662833f),
+            vdupq_n_f16(1.00000011921f),
+            vdupq_n_f16(0.00833693705499f),
+            vdupq_n_f16(0.166665703058f),
+            vdupq_n_f16(0.000195780929062f),
+        }
+    };
+
     static const float16x8_t CONST_LN2          = vdupq_n_f16(0.6931471805f); // ln(2)
     static const float16x8_t CONST_INV_LN2      = vdupq_n_f16(1.4426950408f); // 1/ln(2)
     static const float16x8_t CONST_0            = vdupq_n_f16(0.f);
@@ -285,6 +271,20 @@ inline float16x8_t vexpq_f16(float16x8_t x)
 
 inline float16x8_t vlogq_f16(float16x8_t x)
 {
+    static const std::array<float16x8_t, 8> log_tab_f16 =
+    {
+        {
+            vdupq_n_f16(-2.29561495781f),
+            vdupq_n_f16(-2.47071170807f),
+            vdupq_n_f16(-5.68692588806f),
+            vdupq_n_f16(-0.165253549814f),
+            vdupq_n_f16(5.17591238022f),
+            vdupq_n_f16(0.844007015228f),
+            vdupq_n_f16(4.58445882797f),
+            vdupq_n_f16(0.0141278216615f),
+        }
+    };
+
     static const int16x8_t   CONST_127 = vdupq_n_s16(127);           // 127
     static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); // ln(2)
 
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index f2ad539aeb..e221f7dc00 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -849,6 +849,7 @@ Below is a list of the common parameters among the graph examples :
 For Android, the library was successfully built and tested using Google's standalone toolchains:
  - clang++ from NDK r17b for armv7a
  - clang++ from NDK r17b for arm64-v8a
+ - clang++ from NDK r18-beta1 for arm64-v8.2-a with FP16 support
 
 Here is a guide to <a href="https://developer.android.com/ndk/guides/standalone_toolchain.html">create your Android standalone toolchains from the NDK</a>
 
diff --git a/opencl-1.2-stubs/opencl_stubs.c b/opencl-1.2-stubs/opencl_stubs.c
index a76eaa0bf9..3bfe3a0016 100755
--- a/opencl-1.2-stubs/opencl_stubs.c
+++ b/opencl-1.2-stubs/opencl_stubs.c
@@ -1,3 +1,4 @@
+#define CL_TARGET_OPENCL_VERSION 200
 #include <CL/cl.h>
 #include <stdio.h>
 
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 14fa1b492f..5e1c216b65 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,10 +41,6 @@ using namespace arm_compute;
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
-template class arm_compute::NEHarrisScoreFP16Kernel<3>;
-template class arm_compute::NEHarrisScoreFP16Kernel<5>;
-template class arm_compute::NEHarrisScoreFP16Kernel<7>;
-
 namespace fp16
 {
 inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
@@ -361,6 +357,10 @@ void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const
     INEKernel::configure(win);
 }
 
+template class arm_compute::NEHarrisScoreFP16Kernel<3>;
+template class arm_compute::NEHarrisScoreFP16Kernel<5>;
+template class arm_compute::NEHarrisScoreFP16Kernel<7>;
+
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template class arm_compute::NEHarrisScoreKernel<3>;
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 4041b623b1..484e58b79b 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -392,7 +392,7 @@ void logits_1d_max(const ITensor &in, ITensor &out, const Window &window)
         const auto out_ptr = reinterpret_cast<T *>(output.ptr());
 
         // Init max value
-        auto vec_max = vdup_n<vec_16_byte_t<T>>(std::numeric_limits<T>::lowest());
+        auto vec_max = vdup_n<vec_16_byte_t<T>>(support::cpp11::lowest<T>());
 
         // Loop over input row
         for(const T *it = in_ptr; it < (in_ptr + input_width); it += vec_size_of(vec_max))
@@ -694,7 +694,7 @@ void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const
             {
                 auto vec_elements = vld<vec_16_byte_t<T>>(in_ptr + i);
                 vec_elements      = vsub(vec_elements, vec_max);
-                vec_elements      = vexp(vmul_n(vec_elements, beta));
+                vec_elements      = vexp(vmul_n(vec_elements, static_cast<T>(beta)));
                 vec_sum           = vadd(vec_sum, vec_elements);
                 vst(tmp_ptr + i, vec_elements);
             }
diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h
index ece966704f..7d02e67ec6 100644
--- a/support/ToolchainSupport.h
+++ b/support/ToolchainSupport.h
@@ -315,6 +315,20 @@ inline void *align(std::size_t alignment, std::size_t size, void *&ptr, std::siz
 
     return ptr = reinterpret_cast<void *>(aligned);
 }
+// std::numeric_limits<T>::lowest
+template <typename T>
+inline T lowest()
+{
+    return std::numeric_limits<T>::lowest();
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline __fp16 lowest<__fp16>()
+{
+    return std::numeric_limits<half_float::half>::lowest();
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 // std::isfinite
 template <typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value>::type>
author	Anthony Barbier <anthony.barbier@arm.com>	2018-08-10 17:36:36 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:54:54 +0000
commit	3a6163ed0c2d0ab4cac0456e8f66c704c6ad10c2 (patch)
tree	f2e34b47fb0b8bf1fac18c337a67c0145def7b7b
parent	e3f1bd10e71075f7d8b406698809666e1eb4e7ae (diff)
download	ComputeLibrary-3a6163ed0c2d0ab4cac0456e8f66c704c6ad10c2.tar.gz