From 2ffc85eb0dead5f623d503ed5d2f0a94aba57484 Mon Sep 17 00:00:00 2001
From: Pablo Marquez Tello <pablo.tello@arm.com>
Date: Wed, 13 Sep 2023 11:10:52 +0100
Subject: GenerateProposals changes to enable fp16 in armv8a multi_isa builds

    * Code guarded with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC needs
      to be moved to an fp16.cpp file to allow compilation with
      -march=armv8.2-a+fp16

    * fp16.cpp needs to use the template compute_all_anchors() that
      had to be moved from impl.cpp to impl.h

    * Partially resolves MLCE-1102

Change-Id: Iaff6da32d0b9789ef87ba3f95bef99343612bd01
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10309
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/cpu/kernels/genproposals/generic/neon/impl.cpp | 36 +---------------------
 src/cpu/kernels/genproposals/generic/neon/impl.h   | 33 +++++++++++++++++---
 2 files changed, 30 insertions(+), 39 deletions(-)
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
index 824e85adac..9224e32a94 100644
--- a/src/cpu/kernels/genproposals/generic/neon/impl.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,40 +28,6 @@ class ITensor;
 class Window;
 namespace cpu
 {
-template <typename T>
-void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
-{
-    Iterator all_anchors_it(all_anchors, window);
-    Iterator anchors_it(all_anchors, window);
-
-    const size_t num_anchors = anchors->info()->dimension(1);
-    const T      stride      = 1.f / anchors_info.spatial_scale();
-    const size_t feat_width  = anchors_info.feat_width();
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const size_t anchor_offset = id.y() % num_anchors;
-
-        const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
-        const auto anchor_ptr     = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
-
-        const size_t shift_idy = id.y() / num_anchors;
-        const T      shiftx    = (shift_idy % feat_width) * stride;
-        const T      shifty    = (shift_idy / feat_width) * stride;
-
-        *out_anchor_ptr       = *anchor_ptr + shiftx;
-        *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
-        *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
-        *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
-    },
-    all_anchors_it);
-}
-
-template void compute_all_anchors<float>(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void compute_all_anchors<float16_t>(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
-#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
 void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
 {
     Iterator all_anchors_it(all_anchors, window);
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h
index 88f5e52020..da052c9192 100644
--- a/src/cpu/kernels/genproposals/generic/neon/impl.h
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,16 +24,41 @@
 #ifndef SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
 #define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 namespace arm_compute
 {
-class ITensor;
-class Window;
 namespace cpu
 {
 template <typename T>
-void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
+void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+{
+    // Only the output tensor is iterated; each base anchor is fetched from `anchors` through ptr_to_element().
+    Iterator all_anchors_it(all_anchors, window);
+
+    const size_t num_anchors = anchors->info()->dimension(1);
+    const T      stride      = 1.f / anchors_info.spatial_scale();
+    const size_t feat_width  = anchors_info.feat_width();
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // id.y() is split into a base-anchor index (remainder) and a shift index (quotient) over num_anchors.
+        const size_t anchor_offset = id.y() % num_anchors;
+        const size_t shift_idy     = id.y() / num_anchors;
+        const T      shiftx        = (shift_idy % feat_width) * stride;
+        const T      shifty        = (shift_idy / feat_width) * stride;
+
+        const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
+        const auto anchor_ptr     = reinterpret_cast<const T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+        // Elements 0 and 2 take the x shift; elements 1 and 3 take the y shift.
+        *out_anchor_ptr       = *anchor_ptr + shiftx;
+        *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
+        *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
+        *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
+    },
+    all_anchors_it);
+}
 
 void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
 } // namespace cpu
-- 
cgit v1.2.1