From eb5696d99d85e1d402188151e021bc4b14f93969 Mon Sep 17 00:00:00 2001
From: Anitha Raj
Date: Fri, 14 Jul 2023 11:19:34 +0100
Subject: Optimize CpuReshapeKernel

Resolves COMPMID-5279

Change-Id: Id9b007eed62c200702bbfcc83b94dab7b5de1714
Signed-off-by: Anitha Raj
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9962
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: SiCong Li
Reviewed-by: Viet-Hoa Do
Benchmark: Arm Jenkins
---
 src/core/helpers/Utils.h             |  31 +++++-
 src/cpu/kernels/CpuReshapeKernel.cpp | 187 ++++++++++++++++++++++++++++-------
 src/cpu/kernels/CpuReshapeKernel.h   |  24 ++++-
 src/cpu/operators/CpuReshape.cpp     |  16 ++-
 src/cpu/operators/CpuReshape.h       |   9 +-
 5 files changed, 226 insertions(+), 41 deletions(-)

diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
index 326dc962c7..641d536c13 100644
--- a/src/core/helpers/Utils.h
+++ b/src/core/helpers/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,6 @@
 #define SRC_CORE_HELPERS_UTILS_H
 
 #include "arm_compute/core/ITensorInfo.h"
-
 namespace arm_compute
 {
 /** Create a strides object based on the provided strides and the tensor dimensions.
@@ -38,7 +37,7 @@ namespace arm_compute
  *         calculated based on the tensor shape and the strides of lower dimensions.
  */
 template <typename T, typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixed_strides)
 {
     const TensorShape &shape = info.tensor_shape();
 
@@ -92,6 +91,32 @@ inline unsigned int get_next_power_two(unsigned int x)
     return x;
 }
+
+/** Check if the tensor has any holes.
+ *
+ * @param[in] info      Tensor info object defining the shape of the input tensor.
+ * @param[in] dimension Highest dimension to check.
+ *
+ * @note This function checks for holes in all the dimensions up to and including the highest dimension.
+ *
+ */
+inline bool has_holes(const ITensorInfo &info, size_t dimension)
+{
+    const auto &shape          = info.tensor_shape();
+    const auto &strides        = info.strides_in_bytes();
+    size_t      squashed_bytes = info.element_size();
+
+    for(size_t dim = 0; dim <= dimension; ++dim)
+    {
+        if(strides[dim] != squashed_bytes)
+        {
+            return true;
+        }
+        squashed_bytes *= shape[dim];
+    }
+    return false;
+}
 } // namespace arm_compute
 #endif /* SRC_CORE_HELPERS_UTILS_H */
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index 068ff07efa..a9672a8c5e 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,11 +29,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include <cstdint>
 
 /** [NEReshapeLayerKernel Kernel] **/
@@ -61,21 +59,109 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 
     return Status{};
 }
+
 template <typename T>
-inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst)
+void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst)
+{
+    const TensorShape &src_shape = src->info()->tensor_shape();
+    const TensorShape &dst_shape = dst->info()->tensor_shape();
+
+    Iterator dst_it(dst, window);
+
+    execute_window_loop(window, [&](const Coordinates &dst_coord)
+    {
+        Coordinates src_coord  = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+        const auto  output_ptr = dst->ptr_to_element(dst_coord);
+        const auto  input_ptr  = src->ptr_to_element(src_coord);
+
+        *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
+    },
+    dst_it);
+}
+
+void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst)
+{
+    switch(src->info()->data_type())
+    {
+        case DataType::U8:
+        case DataType::S8:
+        case DataType::QSYMM8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+            reshape_tensor_per_element<uint8_t>(window, src, dst);
+            break;
+        case DataType::U16:
+        case DataType::S16:
+        case DataType::F16:
+            reshape_tensor_per_element<uint16_t>(window, src, dst);
+            break;
+        case DataType::U32:
+        case DataType::S32:
+        case DataType::F32:
+            reshape_tensor_per_element<uint32_t>(window, src, dst);
+            break;
+        case DataType::U64:
+        case DataType::S64:
+        case DataType::F64:
+            reshape_tensor_per_element<uint64_t>(window, src, dst);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type!");
+    }
+}
+
+void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *dst)
 {
     const TensorShape &src_shape = src->info()->tensor_shape();
     const TensorShape &dst_shape = dst->info()->tensor_shape();
+
+    Coordinates src_coord{};
     Coordinates dst_coord{};
-    Iterator src_it(src, window);
+    const auto element_size      = dst->info()->element_size();
+    const auto window_start_x    = static_cast<int>(window.x().start());
+    const auto window_end_x      = static_cast<int>(window.x().end());
+    const auto src_row_size      = static_cast<int>(src_shape[0]);
+    const auto row_size_in_bytes = src_row_size * element_size;
 
-    execute_window_loop(window, [&](const Coordinates &id)
+    auto output_ptr = dst->ptr_to_element(dst_coord);
+    auto input_ptr  = src->ptr_to_element(src_coord);
+
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator dst_it(dst, win);
+    execute_window_loop(win, [&](Coordinates &id)
     {
-        dst_coord = index2coords(dst_shape, coords2index(src_shape, id));
-        *reinterpret_cast<T *>(dst->ptr_to_element(dst_coord)) = *reinterpret_cast<T *>(src_it.ptr());
+        dst_coord = id;
+
+        for(int x = window_start_x; x < window_end_x; x += src_row_size)
+        {
+            src_coord  = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+            output_ptr = dst->ptr_to_element(dst_coord);
+            input_ptr  = src->ptr_to_element(src_coord);
+
+            std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+
+            dst_coord.increment(Window::DimX, src_row_size);
+        }
     },
-    src_it);
+    dst_it);
+}
+
+void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst)
+{
+    Iterator src_it(src, window);
+    Iterator dst_it(dst, window);
+
+    const size_t element_size         = dst->info()->element_size();
+    const auto   window_size          = window.x().end() - window.x().start();
+    const auto   window_size_in_bytes = window_size * element_size;
+
+    const auto input_ptr  = src_it.ptr();
+    const auto output_ptr = dst_it.ptr();
+
+    std::memcpy(output_ptr, input_ptr, window_size_in_bytes);
 }
 } // namespace
 
@@ -83,10 +169,11 @@ void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-    ARM_COMPUTE_UNUSED(dst);
+    ARM_COMPUTE_UNUSED(src);
+
+    _reshape_tensor_fn = reshape_tensor_per_element_selector;
 
     // Configure kernel window
-    Window win = calculate_max_window(*src);
+    Window win = calculate_max_window(*dst);
 
     ICpuKernel::configure(win);
 }
@@ -94,7 +181,6 @@ void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
 
 Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-
     return Status{};
 }
 
@@ -106,28 +192,7 @@ void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
-
-    switch(src->info()->data_type())
-    {
-        case DataType::U8:
-        case DataType::S8:
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-            reshape_tensor<uint8_t>(window, src, dst);
-            break;
-        case DataType::U16:
-        case DataType::S16:
-        case DataType::F16:
-            reshape_tensor<uint16_t>(window, src, dst);
-            break;
-        case DataType::U32:
-        case DataType::S32:
-        case DataType::F32:
-            reshape_tensor<uint32_t>(window, src, dst);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type!");
-    }
+    _reshape_tensor_fn(window, src, dst);
 }
 
 const char *CpuReshapeKernel::name() const
@@ -143,6 +208,58 @@ size_t CpuReshapeKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
 
     return ICPPKernel::default_mws;
 }
 
+void CpuReshapeKernel::prepare(ITensorPack &tensors)
+{
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+
+    const ITensorInfo *src_info = src->info();
+    const ITensorInfo *dst_info = dst->info();
+
+    // Calculate kernel window based on the padding info
+    Window win;
+
+    const bool src_has_holes      = has_holes(*src_info, src_info->num_dimensions() - 1);
+    const bool dst_has_holes      = has_holes(*dst_info, dst_info->num_dimensions() - 1);
+    const bool src_has_holes_in_x = has_holes(*src_info, Window::DimX);
+    const bool dst_has_holes_in_x = has_holes(*dst_info, Window::DimX);
+    const auto src_row_size       = static_cast<int>(src_info->tensor_shape()[0]);
+    const auto dst_row_size       = static_cast<int>(dst_info->tensor_shape()[0]);
+
+    if(!src_has_holes && !dst_has_holes)
+    {
+        std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
+        /*
+            Copy the tensor per window. If the src and dst tensors
+            are contiguous memory allocations without any holes or
+            padding, then the tensor is squashed to a 1D window and
+            we can use a single memcpy call to copy the whole
+            window in the reshape_tensor_per_window fn
+        */
+        _reshape_tensor_fn = reshape_tensor_per_window;
+    }
+    else
+    {
+        win = calculate_max_window(*dst_info);
+        /*
+            Copy the tensor row by row if src and dst have no holes in the X
+            dim and they have the same number of elements in their rows
+        */
+        if(!src_has_holes_in_x && !dst_has_holes_in_x && (src_row_size == dst_row_size))
+        {
+            _reshape_tensor_fn = reshape_tensor_per_row;
+        }
+        else
+        {
+            /*
+                Fall back to the element-wise copy
+            */
+            _reshape_tensor_fn = reshape_tensor_per_element_selector;
+        }
+    }
+
+    ICPPKernel::configure(win);
+}
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index 17302c6731..eddbbf7135 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,6 +58,13 @@ public:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
+    /** Prepare the reshape kernel for execution (only executed once) by calculating the max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes
+     *
+     * @param[in] tensors Pack of input and output tensors
+     *
+     */
+    void prepare(ITensorPack &tensors);
+
     /** Return minimum workload size of the relevant kernel
      *
      * @param[in] platform     The CPU platform used to create the context.
@@ -66,6 +73,21 @@ public:
      * @return[out] small_network_mws Minimum workload size for requested configuration.
      */
     size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
+    /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+     *
+     * @return The split dimension.
+     */
+    size_t get_split_dimension() const
+    {
+        return _split_dimension;
+    }
+
+private:
+    size_t _split_dimension{ Window::DimY };
+
+    std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{};
+
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
index 79e7b8fe6e..e6892a2e7e 100644
--- a/src/cpu/operators/CpuReshape.cpp
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 
 #include "src/common/utils/Log.h"
 
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 namespace arm_compute
 {
 namespace cpu
@@ -43,5 +45,17 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
     return kernels::CpuReshapeKernel::validate(src, dst);
 }
+
+void CpuReshape::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    if(!_is_prepared)
+    {
+        static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
+        _is_prepared = true;
+    }
+    const auto split_dimension = static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->get_split_dimension();
+    NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
index 92dcb09aa9..9bc43e7db4 100644
--- a/src/cpu/operators/CpuReshape.h
+++ b/src/cpu/operators/CpuReshape.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_RESHAPE_H
 
 #include "src/cpu/ICpuOperator.h"
+#include "arm_compute/core/Window.h"
 
 namespace arm_compute
 {
@@ -47,6 +48,12 @@ public:
      * @return a status
      */
    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    bool _is_prepared{ false };
 };
 } // namespace cpu
 } // namespace arm_compute
--
cgit v1.2.1
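
The standalone sketch below is not part of the patch; it only illustrates the stride-based hole check that the new has_holes() helper performs and that CpuReshapeKernel::prepare() uses to choose between the per-window, per-row and per-element copy paths. The helper is re-implemented on plain arrays so it compiles without the library, and the padded-stride values (a 4x4 FP32 tensor whose rows are padded to 5 elements) are assumed purely for illustration.

#include <cstddef>
#include <cstdio>

// Mirrors the logic of arm_compute::has_holes(): a dimension has a "hole" when its
// byte stride differs from the bytes spanned by all lower dimensions combined.
static bool has_holes(const size_t *shape, const size_t *strides_in_bytes, size_t element_size, size_t last_dim)
{
    size_t squashed_bytes = element_size;
    for(size_t dim = 0; dim <= last_dim; ++dim)
    {
        if(strides_in_bytes[dim] != squashed_bytes)
        {
            return true; // padding between rows/planes -> cannot squash into one bulk copy
        }
        squashed_bytes *= shape[dim];
    }
    return false;
}

int main()
{
    const size_t shape[2]          = { 4, 4 };  // 4x4 tensor
    const size_t dense_strides[2]  = { 4, 16 }; // FP32, rows tightly packed
    const size_t padded_strides[2] = { 4, 20 }; // FP32, one element of row padding (assumed)

    // Dense tensor: no holes, so the reshape can be squashed to a single memcpy per window.
    std::printf("dense has holes:  %d\n", has_holes(shape, dense_strides, sizeof(float), 1));
    // Padded tensor: hole in dimension 1, so the kernel falls back to row or element copies.
    std::printf("padded has holes: %d\n", has_holes(shape, padded_strides, sizeof(float), 1));
    return 0;
}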