From a86433a64ab25c2ea8e274bd2f357a9709636f5b Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Wed, 28 Jul 2021 14:10:47 +0100
Subject: Reduce binary footprint of CpuConvertFullyConnectedWeightsKernel

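Replace the templated run_convert_fc_weights<T> helper, which was
instantiated separately for 1-, 2- and 4-byte elements, with a single
memcpy-based loop in run_op that copies element_size bytes per element.
This reduces the binary size of this kernel by almost 50%.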

Also remove the unused
weights_transformations::NEConvertFullyConnectedWeightsManaged class.

Change-Id: Ia46a1342a0737397b4aac2578d963c2ebb7446e3
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6011
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 .../functions/NEConvertFullyConnectedWeights.h     | 45 ++--------------------
 .../CpuConvertFullyConnectedWeightsKernel.cpp      | 41 ++++++--------------
 .../CpuConvertFullyConnectedWeightsKernel.h        |  9 -----
 .../functions/NEConvertFullyConnectedWeights.cpp   |  1 +
 4 files changed, 15 insertions(+), 81 deletions(-)
diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
index 218877d421..a892d3036b 100644
--- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
@@ -25,13 +25,14 @@
 #define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
 
 #include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/ITransformWeights.h"
-#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
+class ITensorInfo;
 
 /** Basic function to run @ref cpu::kernels::CpuConvertFullyConnectedWeightsKernel. */
 class NEConvertFullyConnectedWeights : public IFunction
@@ -84,45 +85,5 @@ private:
     struct Impl;
     std::unique_ptr<Impl> _impl;
 };
-
-namespace weights_transformations
-{
-/** Basic function to manage @ref NEConvertFullyConnectedWeights. */
-class NEConvertFullyConnectedWeightsManaged : public ITransformWeights
-{
-public:
-    void run() override
-    {
-        _output.allocator()->allocate();
-        _func.run();
-        _reshape_run = true;
-    }
-
-    void release() override
-    {
-        _output.allocator()->free();
-    }
-
-    ITensor *get_weights() override
-    {
-        return &_output;
-    }
-
-    uint32_t uid() override
-    {
-        return _uid;
-    }
-
-    void configure(const ITensor *input, const TensorShape &original_input_shape, DataLayout data_layout)
-    {
-        _func.configure(input, &_output, original_input_shape, data_layout);
-    }
-
-private:
-    static constexpr uint32_t      _uid = 0x4;
-    Tensor                         _output{};
-    NEConvertFullyConnectedWeights _func{};
-};
-} // namespace weights_transformations
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H */
diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
index 5bf70dc9bf..5406356bc9 100644
--- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
@@ -81,22 +81,6 @@ Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, c
     return Status{};
 }
 
-template <typename T>
-void CpuConvertFullyConnectedWeightsKernel::run_convert_fc_weights(const ITensor *in, ITensor *out, const Window &window)
-{
-    const unsigned int dst_stride_x = out->info()->strides_in_bytes().x();
-    const unsigned int dst_stride_y = out->info()->strides_in_bytes().y();
-
-    Iterator input(in, window);
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        *reinterpret_cast<T *>(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y) = *reinterpret_cast<T *>(input.ptr());
-    },
-    input);
-}
-
 void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -106,21 +90,18 @@ void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const W
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    switch(src->info()->element_size())
+    const unsigned int dst_stride_x = dst->info()->strides_in_bytes().x();
+    const unsigned int dst_stride_y = dst->info()->strides_in_bytes().y();
+    const unsigned int element_size = src->info()->element_size();
+
+    Iterator input(src, window);
+    Iterator output(dst, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
     {
-        case 1:
-            run_convert_fc_weights<uint8_t>(src, dst, window);
-            break;
-        case 2:
-            run_convert_fc_weights<uint16_t>(src, dst, window);
-            break;
-        case 4:
-            run_convert_fc_weights<uint32_t>(src, dst, window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Data type not supported.");
-            break;
-    }
+        memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size);
+    },
+    input);
 }
 
 const char *CpuConvertFullyConnectedWeightsKernel::name() const
diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
index 3ba3162c34..7baaf13417 100644
--- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
+++ b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
@@ -69,15 +69,6 @@ public:
 private:
     unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */
     unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */
-
-    /** Template function to run the permute
-     *
-     * @param[in] in     Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[in] out    The converted weights tensor info. Shape and Data Type: Same as @p in.
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_convert_fc_weights(const ITensor *in, ITensor *out, const Window &window);
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index f2253d8be4..1f6b3c94e2 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
 
+#include "arm_compute/core/Validate.h"
 #include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
 
 namespace arm_compute
-- 
cgit v1.2.1