From a86433a64ab25c2ea8e274bd2f357a9709636f5b Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 28 Jul 2021 14:10:47 +0100 Subject: Reduce binary footprint of CpuConvertFullyConnectedWeightsKernel Binary size reduction for this kernel is almost 50%. Also remove unused NEConvertFullyConnectedWeightsManaged. Change-Id: Ia46a1342a0737397b4aac2578d963c2ebb7446e3 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6011 Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- .../functions/NEConvertFullyConnectedWeights.h | 45 ++-------------------- .../CpuConvertFullyConnectedWeightsKernel.cpp | 41 ++++++-------------- .../CpuConvertFullyConnectedWeightsKernel.h | 9 ----- .../functions/NEConvertFullyConnectedWeights.cpp | 1 + 4 files changed, 15 insertions(+), 81 deletions(-) diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h index 218877d421..a892d3036b 100644 --- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h @@ -25,13 +25,14 @@ #define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/ITransformWeights.h" -#include "arm_compute/runtime/Tensor.h" + +#include "arm_compute/core/Types.h" namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; /** Basic function to run @ref cpu::kernels::CpuConvertFullyConnectedWeightsKernel. */ class NEConvertFullyConnectedWeights : public IFunction @@ -84,45 +85,5 @@ private: struct Impl; std::unique_ptr<Impl> _impl; }; - -namespace weights_transformations -{ -/** Basic function to manage @ref NEConvertFullyConnectedWeights. 
*/ -class NEConvertFullyConnectedWeightsManaged : public ITransformWeights -{ -public: - void run() override - { - _output.allocator()->allocate(); - _func.run(); - _reshape_run = true; - } - - void release() override - { - _output.allocator()->free(); - } - - ITensor *get_weights() override - { - return &_output; - } - - uint32_t uid() override - { - return _uid; - } - - void configure(const ITensor *input, const TensorShape &original_input_shape, DataLayout data_layout) - { - _func.configure(input, &_output, original_input_shape, data_layout); - } - -private: - static constexpr uint32_t _uid = 0x4; - Tensor _output{}; - NEConvertFullyConnectedWeights _func{}; -}; -} // namespace weights_transformations } // namespace arm_compute #endif /* ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H */ diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp index 5bf70dc9bf..5406356bc9 100644 --- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp +++ b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp @@ -81,22 +81,6 @@ Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, c return Status{}; } -template <typename T> -void CpuConvertFullyConnectedWeightsKernel::run_convert_fc_weights(const ITensor *in, ITensor *out, const Window &window) -{ - const unsigned int dst_stride_x = out->info()->strides_in_bytes().x(); - const unsigned int dst_stride_y = out->info()->strides_in_bytes().y(); - - Iterator input(in, window); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - *reinterpret_cast<T *>(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y) = *reinterpret_cast<const T *>(input.ptr()); - }, - input); -} - void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -106,21 +90,18 @@ void 
CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const W const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(src->info()->element_size()) + const unsigned int dst_stride_x = dst->info()->strides_in_bytes().x(); + const unsigned int dst_stride_y = dst->info()->strides_in_bytes().y(); + const unsigned int element_size = src->info()->element_size(); + + Iterator input(src, window); + Iterator output(dst, window); + + execute_window_loop(window, [&](const Coordinates & id) { - case 1: - run_convert_fc_weights<uint8_t>(src, dst, window); - break; - case 2: - run_convert_fc_weights<uint16_t>(src, dst, window); - break; - case 4: - run_convert_fc_weights<uint32_t>(src, dst, window); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } + memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size); + }, + input); } const char *CpuConvertFullyConnectedWeightsKernel::name() const diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h index 3ba3162c34..7baaf13417 100644 --- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h +++ b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h @@ -69,15 +69,6 @@ public: private: unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ - - /** Template function to run the permute - * - * @param[in] in Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. - * @param[in] out The converted weights tensor info. Shape and Data Type: Same as @p in. 
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template <typename T> - void run_convert_fc_weights(const ITensor *in, ITensor *out, const Window &window); }; } // namespace kernels } // namespace cpu diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp index f2253d8be4..1f6b3c94e2 100644 --- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp +++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "arm_compute/core/Validate.h" #include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h" namespace arm_compute -- cgit v1.2.1