From f2cdce30ca782cac7caebc43c7e67caf677b7358 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Mon, 9 Dec 2019 18:35:57 +0000
Subject: COMPMID-2841: Enable aarch32 builds

Decouples the execution state from the architecture.
Architectures can now be set as (armv7a, armv8a, etc.), and the execution
state is selected with the `estate` flag, which accepts (auto, 32, 64).

Change-Id: Ie7f757b3565495a39c7e20fb350a72fd9c5a2a4f
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2438
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 SConscript                                         |  5 +-
 SConstruct                                         | 90 +++++++++++++---------
 arm_compute/core/NEON/NEMath.inl                   |  4 +-
 docs/Doxyfile                                      |  4 +-
 src/core/NEON/kernels/NESelectKernel.cpp           |  4 +-
 .../arm_gemm/merges/a32_merge_float_8x6.hpp        |  4 +
 6 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/SConscript b/SConscript
index ed22f6eefe..0b7729cced 100644
--- a/SConscript
+++ b/SConscript
@@ -216,11 +216,10 @@ if env['neon']:
 
     graph_files += Glob('src/graph/backends/NEON/*.cpp')
 
-    if env['arch'] == "armv7a":
+    if env['estate'] == '32':
         core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a32_*/*.cpp')
 
-
-    if "arm64-v8" in env['arch']:
+    if env['estate'] == '64':
         core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp')
         if "sve" in env['arch']:
              core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp')
diff --git a/SConstruct b/SConstruct
index e63d33e9fd..216920f059 100644
--- a/SConstruct
+++ b/SConstruct
@@ -40,7 +40,9 @@ vars.AddVariables(
     BoolVariable("debug", "Debug", False),
     BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False),
     BoolVariable("logging", "Logging (this flag is forced to 1 for debug=1)", False),
-    EnumVariable("arch", "Target Architecture", "armv7a", allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "arm64-v8.2-a-sve", "x86_32", "x86_64")),
+    EnumVariable("arch", "Target Architecture", "armv7a",
+                  allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "arm64-v8.2-a-sve", "x86_32", "x86_64", "armv8a", "armv8.2-a", "armv8.2-a-sve", "x86")),
+    EnumVariable("estate", "Execution State", "auto", allowed_values=("auto", "32", "64")),
     EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "bare_metal")),
     EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile", "embed_only")),
     BoolVariable("examples", "Build example programs", True),
@@ -164,48 +166,66 @@ if env['openmp']:
     env.Append(CXXFLAGS = ['-fopenmp'])
     env.Append(LINKFLAGS = ['-fopenmp'])
 
+# Resolve estate=auto: 32-bit for armv7a, 64-bit for every other arch
+if env['estate'] == 'auto':
+    if 'v7a' in env['arch']:
+        env['estate'] = '32'
+    else:
+        env['estate'] = '64'
+
+# Map legacy arch: the arm64-* names always force the 64-bit state
+if 'arm64' in env['arch']:
+    env['estate'] = '64'
+
+if 'v7a' in env['arch'] and env['estate'] == '64':
+    print("ERROR: armv7a architecture has only 32-bit execution state")
+    Exit(1)
+
 # Add architecture specific flags
 prefix = ""
-if env['arch'] == 'armv7a':
+if 'v7a' in env['arch']:
     env.Append(CXXFLAGS = ['-march=armv7-a', '-mthumb', '-mfpu=neon'])
-
-    if env['os'] == 'linux':
-        prefix = "arm-linux-gnueabihf-"
-        env.Append(CXXFLAGS = ['-mfloat-abi=hard'])
-    elif env['os'] == 'bare_metal':
-        prefix = "arm-eabi-"
-        env.Append(CXXFLAGS = ['-mfloat-abi=hard'])
-    elif env['os'] == 'android':
-        prefix = "arm-linux-androideabi-"
+    if env['os'] == 'android':
         env.Append(CXXFLAGS = ['-mfloat-abi=softfp'])
-elif env['arch'] == 'arm64-v8a':
+    else:
+        env.Append(CXXFLAGS = ['-mfloat-abi=hard'])
+elif 'v8a' in env['arch']:
     env.Append(CXXFLAGS = ['-march=armv8-a'])
-    env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8A'])
-    if env['os'] == 'linux':
-        prefix = "aarch64-linux-gnu-"
-    elif env['os'] == 'bare_metal':
-        prefix = "aarch64-elf-"
-    elif env['os'] == 'android':
-        prefix = "aarch64-linux-android-"
-elif 'arm64-v8.2-a' in env['arch']:
-    if env['arch'] == 'arm64-v8.2-a-sve':
+    if env['estate'] == '32':
+        env.Append(CXXFLAGS = ['-mfpu=neon-fp-armv8'])
+elif 'v8.2-a' in env['arch']:
+    if env['estate'] == '32':
+        env.Append(CXXFLAGS = ['-mfpu=neon-fp-armv8'])
+    if 'sve' in env['arch']:
         env.Append(CXXFLAGS = ['-march=armv8.2-a+sve+fp16+dotprod'])
     else:
         env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined
-    if env['os'] == 'linux':
-        prefix = "aarch64-linux-gnu-"
-    elif env['os'] == 'bare_metal':
-        prefix = "aarch64-elf-"
-    elif env['os'] == 'android':
-        prefix = "aarch64-linux-android-"
-    env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8_2'])
-elif env['arch'] == 'x86_32':
-    env.Append(CCFLAGS = ['-m32'])
-    env.Append(LINKFLAGS = ['-m32'])
-elif env['arch'] == 'x86_64':
-    env.Append(CXXFLAGS = ['-fPIC'])
-    env.Append(CCFLAGS = ['-m64'])
-    env.Append(LINKFLAGS = ['-m64'])
+elif 'x86' in env['arch']:
+    if env['estate'] == '32':
+        env.Append(CCFLAGS = ['-m32'])
+        env.Append(LINKFLAGS = ['-m32'])
+    else:
+        env.Append(CXXFLAGS = ['-fPIC'])
+        env.Append(CCFLAGS = ['-m64'])
+        env.Append(LINKFLAGS = ['-m64'])
+
+# Define toolchain
+prefix = ""
+if 'x86' not in env['arch']:
+    if env['estate'] == '32':
+        if env['os'] == 'linux':
+            prefix = "arm-linux-gnueabihf-" if 'v7' in env['arch'] else "armv8l-linux-gnueabihf-"
+        elif env['os'] == 'bare_metal':
+            prefix = "arm-eabi-"
+        elif env['os'] == 'android':
+            prefix = "arm-linux-androideabi-"
+    elif env['estate'] == '64' and 'v8' in env['arch']:
+        if env['os'] == 'linux':
+            prefix = "aarch64-linux-gnu-"
+        elif env['os'] == 'bare_metal':
+            prefix = "aarch64-elf-"
+        elif env['os'] == 'android':
+            prefix = "aarch64-linux-android-"
 
 if env['build'] == 'native':
     prefix = ""
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index a3601f6a25..179f1b6299 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -434,7 +434,7 @@ inline float16x8_t vexpq_f16(float16x8_t x)
     const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
     const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
 
-    const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vexpq_f32(x_low)), vexpq_f32(x_high));
+    const float16x8_t res = vcombine_f16(vcvt_f16_f32(vexpq_f32(x_low)), vcvt_f16_f32(vexpq_f32(x_high)));
     return res;
 }
 
@@ -444,7 +444,7 @@ inline float16x8_t vlogq_f16(float16x8_t x)
     const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
     const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
 
-    const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vlogq_f32(x_low)), vlogq_f32(x_high));
+    const float16x8_t res = vcombine_f16(vcvt_f16_f32(vlogq_f32(x_low)), vcvt_f16_f32(vlogq_f32(x_high)));
     return res;
 }
 
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 835aa00a05..7f5aa5bdbe 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -2087,9 +2087,7 @@ PREDEFINED             = DOXYGEN_SKIP_THIS \
                          LOCATE_MIN \
                          LOCATE_MAX \
                          HAS_BIAS \
-                         POOL_AVG \
-                         ARM_COMPUTE_AARCH64_V8_2 \
-                         ARM_COMPUTE_AARCH64_V8A
+                         POOL_AVG
 
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 72afe4f054..191d182002 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -197,7 +197,7 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
         };
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         map_function["op_F16"] = &select_op_16<float16_t, uint16x8_t>;
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     }
     else
     {
@@ -213,7 +213,7 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
         };
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         map_function["op_F16"] = &select_op_not_same_rank<float16_t>;
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     }
 
     auto it = map_function.find(function_to_call);
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
index 9409646818..16bdbb5986 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -83,12 +83,16 @@ void MergeResults<8, 6, false>(float *out, const float *in, const int ldout, con
                 switch ((y + 5) - ymax) {
                     case 4:
                         outptr1 = dummyres;
+                        /* fall through */
                     case 3:
                         outptr2 = dummyres;
+                        /* fall through */
                     case 2:
                         outptr3 = dummyres;
+                        /* fall through */
                     case 1:
                         outptr4 = dummyres;
+                        /* fall through */
                     case 0:
                         outptr5 = dummyres;
                         break;
-- 
cgit v1.2.1