From a50e702289af66944e860eafc7f3b32f6c5f30be Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Tue, 9 Apr 2019 14:03:17 +0100
Subject: COMPMID-2012: Remove unnecessary templates from NEON kernels

Change-Id: I2deb26188c7de7c6ad10d2f51f83e729fed7e5e2
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/961
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/NEON/kernels/NEReorgLayerKernel.h |  23 +---
 arm_compute/core/NEON/kernels/NEStackLayerKernel.h |  14 ---
 .../core/NEON/kernels/NEWeightsReshapeKernel.h     |  11 +-
 src/core/NEON/kernels/NEReorgLayerKernel.cpp       |  96 ++++++----------
 src/core/NEON/kernels/NEReverseKernel.cpp          |  18 +--
 src/core/NEON/kernels/NEStackLayerKernel.cpp       |  33 +-----
 src/core/NEON/kernels/NEWeightsReshapeKernel.cpp   | 128 ++++++++-------------
 7 files changed, 100 insertions(+), 223 deletions(-)
diff --git a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h b/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
index 7e0fb4350d..076af4fd1c 100644
--- a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -75,24 +75,9 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    /** Template function to run the reorg
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_reorg(const Window &window);
-
-    /** Common signature for all the specialised reorg functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ReorgFunctionPtr = void (NEReorgLayerKernel::*)(const Window &window);
-
-private:
-    ReorgFunctionPtr _func;
-    const ITensor   *_input;
-    ITensor         *_output;
-    int32_t          _stride;
+    const ITensor *_input;
+    ITensor       *_output;
+    int32_t        _stride;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEREORGLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h b/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
index 3a9e81fa94..42a0539c9f 100644
--- a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
@@ -84,24 +84,10 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    /** Template function to run the stack
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_stack(const Window &window);
-
-    /** Common signature for all the specialised stack functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using StackFunctionPtr = void (NEStackLayerKernel::*)(const Window &window);
-
     const ITensor *_input;
     ITensor       *_output;
     unsigned int   _axis;
     unsigned int   _idx_input;
-    StackFunctionPtr _func;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NESTACKLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
index 21f36f6c2b..bba18a8fa8 100644
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,12 +99,9 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    using WeightsReshapeKernel = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window);
-
-    WeightsReshapeKernel *_func;
-    const ITensor        *_input;
-    const ITensor        *_bias;
-    ITensor              *_output;
+    const ITensor *_input;
+    const ITensor *_bias;
+    ITensor       *_output;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ */
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index 8baea2b990..ece5aa431c 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,47 +67,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 }
 } // namespace
 
-template <typename T>
-void NEReorgLayerKernel::run_reorg(const Window &window)
-{
-    const DataLayout data_layout = _input->info()->data_layout();
-    const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const size_t     idx_c       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-    const unsigned int stride = _stride;
-    const unsigned int out_c  = _output->info()->tensor_shape()[idx_c] / (stride * stride);
-    const uint8_t     *in_ptr = _input->buffer();
-
-    // Collapse
-    Window collapsed_window = window.collapse_if_possible(window, 4);
-
-    // Create Iterator
-    Iterator out(_output, collapsed_window);
-
-    // Perform reorg
-    execute_window_loop(collapsed_window, [&](const Coordinates & id)
-    {
-        // Get spatial coords and channels
-        const unsigned int w = id[idx_w];
-        const unsigned int h = id[idx_h];
-        const unsigned int c = id[idx_c];
-
-        // Calculate mapping
-        const unsigned int offset     = c / out_c;
-        Coordinates        map_coords = id;
-        map_coords.set(idx_w, w * stride + offset % stride);
-        map_coords.set(idx_h, h * stride + offset / stride);
-        map_coords.set(idx_c, c % out_c);
-
-        // Perform mapping
-        *(reinterpret_cast<T *>(out.ptr())) = *(reinterpret_cast<const T *>(in_ptr + _input->info()->offset_element_in_bytes(map_coords)));
-    },
-    out);
-}
-
 NEReorgLayerKernel::NEReorgLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _stride(1)
+    : _input(nullptr), _output(nullptr), _stride(1)
 {
 }
 
@@ -122,27 +83,10 @@ void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
 
-    _func   = nullptr;
     _input  = input;
     _output = output;
     _stride = stride;
 
-    switch(input->info()->element_size())
-    {
-        case 1:
-            _func = &NEReorgLayerKernel::run_reorg<uint8_t>;
-            break;
-        case 2:
-            _func = &NEReorgLayerKernel::run_reorg<uint16_t>;
-            break;
-        case 4:
-            _func = &NEReorgLayerKernel::run_reorg<uint32_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
-
     // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
     output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
@@ -164,9 +108,42 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    if(_func != nullptr)
+    const DataLayout data_layout = _input->info()->data_layout();
+    const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t     idx_c       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const unsigned int stride = _stride;
+    const unsigned int out_c  = _output->info()->tensor_shape()[idx_c] / (stride * stride);
+    const uint8_t     *in_ptr = _input->buffer();
+
+    // Query the element size once; the copy below would otherwise look it up again for every element
+    const size_t element_size = _input->info()->element_size();
+
+    // Collapse
+    Window collapsed_window = window.collapse_if_possible(window, 4);
+
+    // Create Iterator
+    Iterator out(_output, collapsed_window);
+
+    // Perform reorg
+    execute_window_loop(collapsed_window, [&](const Coordinates & id)
     {
-        (this->*_func)(window);
-    }
+        // Get spatial coords and channels
+        const unsigned int w = id[idx_w];
+        const unsigned int h = id[idx_h];
+        const unsigned int c = id[idx_c];
+
+        // Calculate mapping
+        const unsigned int offset     = c / out_c;
+        Coordinates        map_coords = id;
+        map_coords.set(idx_w, w * stride + offset % stride);
+        map_coords.set(idx_h, h * stride + offset / stride);
+        map_coords.set(idx_c, c % out_c);
+
+        // Perform mapping: copy one element from the mapped input location to the current output position
+        std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), element_size);
+    },
+    out);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 62e48829f6..36398cf89a 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -189,31 +189,21 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
     switch(_input->info()->data_type())
     {
         case DataType::F32:
-            run_reverse<float>(window, _input, _axis, _output);
+        case DataType::U32:
+        case DataType::S32:
+            run_reverse<uint32_t>(window, _input, _axis, _output);
             break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            run_reverse<float16_t>(window, _input, _axis, _output);
-            break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::U32:
-            run_reverse<uint32_t>(window, _input, _axis, _output);
-            break;
-        case DataType::S32:
-            run_reverse<int32_t>(window, _input, _axis, _output);
-            break;
         case DataType::S16:
-            run_reverse<int16_t>(window, _input, _axis, _output);
-            break;
         case DataType::U16:
             run_reverse<uint16_t>(window, _input, _axis, _output);
             break;
         case DataType::QASYMM8:
         case DataType::U8:
-            run_reverse<uint8_t>(window, _input, _axis, _output);
-            break;
         case DataType::S8:
-            run_reverse<int8_t>(window, _input, _axis, _output);
+            run_reverse<uint8_t>(window, _input, _axis, _output);
             break;
         default:
             ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 0c33f36983..3447d59bcc 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -87,7 +87,7 @@ inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id,
 } // namespace
 
 NEStackLayerKernel::NEStackLayerKernel()
-    : _input(nullptr), _output(nullptr), _axis(), _idx_input(), _func(nullptr)
+    : _input(nullptr), _output(nullptr), _axis(), _idx_input()
 {
 }
 
@@ -101,22 +101,6 @@ void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsi
     _axis      = axis;
     _idx_input = idx_input;
 
-    switch(input->info()->element_size())
-    {
-        case 1:
-            _func = &NEStackLayerKernel::run_stack<uint8_t>;
-            break;
-        case 2:
-            _func = &NEStackLayerKernel::run_stack<uint16_t>;
-            break;
-        case 4:
-            _func = &NEStackLayerKernel::run_stack<uint32_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
 
@@ -137,15 +121,6 @@ void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    if(_func != nullptr)
-    {
-        (this->*_func)(window);
-    }
-}
-
-template <typename T>
-void NEStackLayerKernel::run_stack(const Window &window)
-{
     Window window_out;
     window_out.use_tensor_dimensions(_output->info()->tensor_shape());
 
@@ -160,9 +135,12 @@ void NEStackLayerKernel::run_stack(const Window &window)
 
+    // Query the element size once; the copy in the loop would otherwise look it up again for every element
+    const auto element_size = _input->info()->element_size();
+
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        Coordinates id_out                           = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
-        const int   idx                              = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
-        *(reinterpret_cast<T *>(output.ptr() + idx)) = *(reinterpret_cast<const T *>(input.ptr()));
+        Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+        const int   idx    = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
+        std::memcpy(output.ptr() + idx, input.ptr(), element_size);
     },
     input);
 }
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 4a0cf27592..624833adfb 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -34,59 +34,6 @@ using namespace arm_compute;
 
 namespace
 {
-template <typename T>
-void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
-{
-    const unsigned int kernel_size_x   = input->info()->dimension(0);
-    const unsigned int kernel_size_y   = input->info()->dimension(1);
-    const unsigned int kernel_depth    = input->info()->dimension(2);
-    const unsigned int input_stride_x  = input->info()->strides_in_bytes().x();
-    const unsigned int input_stride_y  = input->info()->strides_in_bytes().y();
-    const unsigned int input_stride_z  = input->info()->strides_in_bytes().z();
-    const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
-
-    // Create iterators
-    Iterator in(input, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Get column index
-        const int kernel_idx = id[3];
-        const int kernel_idz = id[4];
-
-        // Setup pointers
-        const uint8_t *tmp_input_ptr        = in.ptr();
-        uint8_t       *tmp_output_ptr       = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
-        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
-        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
-        // Linearize volume
-        for(unsigned int d = 0; d < kernel_depth; ++d)
-        {
-            for(unsigned int j = 0; j < kernel_size_y; ++j)
-            {
-                for(unsigned int i = 0; i < kernel_size_x; ++i)
-                {
-                    *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
-                    tmp_input_ptr += input_stride_x;
-                    tmp_output_ptr += output_stride_y;
-                }
-                curr_input_row_ptr += input_stride_y;
-                tmp_input_ptr = curr_input_row_ptr;
-            }
-            curr_input_depth_ptr += input_stride_z;
-            curr_input_row_ptr = curr_input_depth_ptr;
-            tmp_input_ptr      = curr_input_depth_ptr;
-        }
-
-        // Add bias
-        if(bias != nullptr)
-        {
-            *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
-        }
-    },
-    in);
-}
-
 TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
 {
     TensorShape output_shape{ input->tensor_shape() };
@@ -141,7 +88,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 NEWeightsReshapeKernel::NEWeightsReshapeKernel()
-    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
 }
 
@@ -161,30 +108,6 @@ void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias
     _bias   = bias;
     _output = output;
 
-    switch(_input->info()->element_size())
-    {
-        case 4:
-        {
-            _func = &weights_reshape<uint32_t>;
-            break;
-        }
-        case 2:
-        {
-            _func = &weights_reshape<uint16_t>;
-            break;
-        }
-        case 1:
-        {
-            _func = &weights_reshape<uint8_t>;
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR_ON("Element size not supported");
-            break;
-        }
-    }
-
     // Configure kernel
     auto win_config = validate_and_configure_window(input->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -205,5 +128,55 @@ void NEWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    (*_func)(_input, _bias, _output, window);
+    const unsigned int kernel_size_x   = _input->info()->dimension(0);
+    const unsigned int kernel_size_y   = _input->info()->dimension(1);
+    const unsigned int kernel_depth    = _input->info()->dimension(2);
+    const unsigned int input_stride_x  = _input->info()->strides_in_bytes().x();
+    const unsigned int input_stride_y  = _input->info()->strides_in_bytes().y();
+    const unsigned int input_stride_z  = _input->info()->strides_in_bytes().z();
+    const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
+
+    // Query the element size once so the copy loops below do not look it up again for every element
+    const auto element_size = _input->info()->element_size();
+
+    // Create iterators
+    Iterator in(_input, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get column index
+        const int kernel_idx = id[3];
+        const int kernel_idz = id[4];
+
+        // Setup pointers
+        const uint8_t *tmp_input_ptr        = in.ptr();
+        uint8_t       *tmp_output_ptr       = _output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
+        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+        // Linearize volume
+        for(unsigned int d = 0; d < kernel_depth; ++d)
+        {
+            for(unsigned int j = 0; j < kernel_size_y; ++j)
+            {
+                for(unsigned int i = 0; i < kernel_size_x; ++i)
+                {
+                    std::memcpy(tmp_output_ptr, tmp_input_ptr, element_size);
+                    tmp_input_ptr += input_stride_x;
+                    tmp_output_ptr += output_stride_y;
+                }
+                curr_input_row_ptr += input_stride_y;
+                tmp_input_ptr = curr_input_row_ptr;
+            }
+            curr_input_depth_ptr += input_stride_z;
+            curr_input_row_ptr = curr_input_depth_ptr;
+            tmp_input_ptr      = curr_input_depth_ptr;
+        }
+
+        // Add bias (copied with the input element size, as the linearized values are)
+        if(_bias != nullptr)
+        {
+            std::memcpy(tmp_output_ptr, _bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), element_size);
+        }
+    },
+    in);
 }
-- 
cgit v1.2.1