From 94f799e8f6f605333d40472860fb472e8ba6d83d Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Wed, 9 Jun 2021 16:37:32 +0100 Subject: Fix incorrect memory handling in ported functions Details of the functions: - ClSoftmax - CpuSoftmax - CpuPool2d Change-Id: Icd2c14d5df010c3b2301e2693ce6f414d7c61916 Resolves: COMPMID-4404 Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5797 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- src/runtime/cpu/operators/CpuPool2d.cpp | 8 ++- src/runtime/cpu/operators/CpuPool2d.h | 2 +- src/runtime/cpu/operators/CpuSoftmax.cpp | 99 ++++++++++++++++----------- src/runtime/cpu/operators/CpuSoftmax.h | 32 ++++++--- src/runtime/cpu/utils/CpuAuxTensorHandler.h | 101 ++++++++++++++++++++++++++++ 5 files changed, 187 insertions(+), 55 deletions(-) create mode 100644 src/runtime/cpu/utils/CpuAuxTensorHandler.h (limited to 'src/runtime/cpu') diff --git a/src/runtime/cpu/operators/CpuPool2d.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp index b225199c40..e746c8fb3b 100644 --- a/src/runtime/cpu/operators/CpuPool2d.cpp +++ b/src/runtime/cpu/operators/CpuPool2d.cpp @@ -30,6 +30,8 @@ #include "src/core/cpu/kernels/CpuPool2dKernel.h" #include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" +using namespace arm_compute::experimental; + namespace arm_compute { namespace cpu @@ -40,7 +42,7 @@ CpuPool2d::CpuPool2d() _asm_glue(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW), - _mem_req() + _aux_mem(1) { } @@ -71,7 +73,7 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Get kernel's memory requirements constexpr size_t alignment = 4096; const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment }); + _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); _asm_glue 
= std::move(pooling_wrapper); } @@ -150,7 +152,7 @@ void CpuPool2d::run(ITensorPack &tensors) experimental::MemoryRequirements CpuPool2d::workspace() const { - return _mem_req; + return _aux_mem; } } // namespace cpu } // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPool2d.h b/src/runtime/cpu/operators/CpuPool2d.h index ae3d115dfc..68416b5cfc 100644 --- a/src/runtime/cpu/operators/CpuPool2d.h +++ b/src/runtime/cpu/operators/CpuPool2d.h @@ -80,7 +80,7 @@ private: bool _is_global_pooling_layer; DataLayout _data_layout; - experimental::MemoryRequirements _mem_req; + experimental::MemoryRequirements _aux_mem{}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuSoftmax.cpp b/src/runtime/cpu/operators/CpuSoftmax.cpp index 0e1bcd5c69..e17925ee50 100644 --- a/src/runtime/cpu/operators/CpuSoftmax.cpp +++ b/src/runtime/cpu/operators/CpuSoftmax.cpp @@ -29,7 +29,11 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/core/cpu/kernels/CpuSoftmaxKernel.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" +#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" + +using namespace arm_compute::experimental; namespace arm_compute { @@ -37,7 +41,16 @@ namespace cpu { template CpuSoftmaxGeneric::CpuSoftmaxGeneric() - : _permute_input(), _permute_output(), _max_kernel(), _softmax_kernel(), _max(nullptr), _tmp(nullptr), _input_permuted(nullptr), _output_permuted(nullptr), _needs_permute(false) + : _permute_input(), + _permute_output(), + _max_kernel(), + _softmax_kernel(), + _max(), + _tmp(), + _input_permuted(), + _output_permuted(), + _needs_permute(false), + _aux_mem(InternalTensorIdx::COUNT) { } @@ -54,13 +67,12 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d if(_needs_permute) { - _input_permuted = std::make_unique(); - _permute_input.configure(src, _input_permuted.get(), 
softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case) // or it is the original input case (2D case) - const ITensorInfo *tmp_input = (_needs_permute ? _input_permuted.get() : src); + const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src); // Create intermediate tensors shapes TensorShape max_sum_shape = tmp_input->tensor_shape(); @@ -71,31 +83,35 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *d TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); // Init intermediate tensors - _max = std::make_unique(max_info); - _tmp = std::make_unique(tensor_info_tmp); + _max = TensorInfo(max_info); + _tmp = TensorInfo(tensor_info_tmp); // Configure kernels auto mk = std::make_unique(); - mk->configure(tmp_input, _max.get()); + mk->configure(tmp_input, &_max); _max_kernel = std::move(mk); auto sm = std::make_unique>(); if(_needs_permute) { - _output_permuted = std::make_unique(); - // The normalization kernel stores the result in a permuted output tensor - sm->configure(tmp_input, _max.get(), _output_permuted.get(), beta, _tmp.get()); + sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); // Re-permute the permuted output into the requested (4D) output - _permute_output.configure(_output_permuted.get(), dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } else { // Softmax 2D case - sm->configure(tmp_input, _max.get(), dst, beta, _tmp.get()); + sm->configure(tmp_input, &_max, dst, beta, &_tmp); } _softmax_kernel = std::move(sm); + + _aux_mem[InternalTensorIdx::MAX] = 
MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); + _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size()); } template @@ -141,42 +157,54 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, false); + CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, false); + + CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, false); + CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, false); + ITensorPack max_pack; ITensorPack softmax_pack; if(_needs_permute) { - ITensorPack permute_in_pack; - permute_in_pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC)); - permute_in_pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_INT_2)); + ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } }; _permute_input.run(permute_in_pack); - max_pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(ACL_INT_2)); + max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } }; - softmax_pack.add_tensor(TensorType::ACL_SRC_0, tensors.get_tensor(ACL_INT_2)); - softmax_pack.add_tensor(TensorType::ACL_SRC_1, 
tensors.get_tensor(ACL_INT_1)); - softmax_pack.add_tensor(TensorType::ACL_DST_0, tensors.get_tensor(ACL_INT_3)); - softmax_pack.add_tensor(TensorType::ACL_DST_1, tensors.get_tensor(ACL_INT_0)); + softmax_pack = + { + { TensorType::ACL_SRC_0, input_permuted.get() }, + { TensorType::ACL_SRC_1, max.get() }, + { TensorType::ACL_DST_0, output_permuted.get() }, + { TensorType::ACL_DST_1, tmp.get() } + }; } else { - max_pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC)); - softmax_pack.add_tensor(TensorType::ACL_SRC_0, tensors.get_const_tensor(ACL_SRC)); - softmax_pack.add_tensor(TensorType::ACL_SRC_1, tensors.get_tensor(ACL_INT_1)); - softmax_pack.add_tensor(TensorType::ACL_DST_0, tensors.get_tensor(ACL_DST)); - softmax_pack.add_tensor(TensorType::ACL_DST_1, tensors.get_tensor(ACL_INT_0)); + max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } }; + + softmax_pack = + { + { TensorType::ACL_SRC_0, src }, + { TensorType::ACL_SRC_1, max.get() }, + { TensorType::ACL_DST_0, dst }, + { TensorType::ACL_DST_1, tmp.get() } + }; } - max_pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_INT_1)); - NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); if(_needs_permute) { ITensorPack permute_out_pack; - permute_out_pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(ACL_INT_3)); - permute_out_pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); + permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get()); + permute_out_pack.add_tensor(TensorType::ACL_DST, dst); _permute_output.run(permute_out_pack); } } @@ -184,18 +212,7 @@ void CpuSoftmaxGeneric::run(ITensorPack &tensors) template experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const { - experimental::MemoryRequirements req{}; - - req.push_back({ TensorType::ACL_INT_0, 
_tmp->total_size(), 0 }); - req.push_back({ TensorType::ACL_INT_1, _max->total_size(), 0 }); - - if(_needs_permute) - { - req.push_back({ TensorType::ACL_INT_2, _input_permuted->total_size(), 0 }); - req.push_back({ TensorType::ACL_INT_3, _output_permuted->total_size(), 0 }); - } - - return req; + return _aux_mem; } template class CpuSoftmaxGeneric; diff --git a/src/runtime/cpu/operators/CpuSoftmax.h b/src/runtime/cpu/operators/CpuSoftmax.h index 9f18e0e4c5..38817977b3 100644 --- a/src/runtime/cpu/operators/CpuSoftmax.h +++ b/src/runtime/cpu/operators/CpuSoftmax.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CPU_SOFTMAX_H #define ARM_COMPUTE_CPU_SOFTMAX_H -#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/experimental/Types.h" #include "src/core/cpu/ICpuKernel.h" #include "src/runtime/cpu/ICpuOperator.h" @@ -87,15 +87,27 @@ public: experimental::MemoryRequirements workspace() const override; private: - CpuPermute _permute_input; - CpuPermute _permute_output; - std::unique_ptr _max_kernel; - std::unique_ptr _softmax_kernel; - std::unique_ptr _max; - std::unique_ptr _tmp; - std::unique_ptr _input_permuted; - std::unique_ptr _output_permuted; - bool _needs_permute; + enum InternalTensorIdx + { + MAX = 0, + TMP, + PERMUTED_SRC, + PERMUTED_DST, + COUNT + }; + + CpuPermute _permute_input; + CpuPermute _permute_output; + std::unique_ptr _max_kernel; + std::unique_ptr _softmax_kernel; + + TensorInfo _max; + TensorInfo _tmp; + TensorInfo _input_permuted; + TensorInfo _output_permuted; + + bool _needs_permute; + experimental::MemoryRequirements _aux_mem{}; }; using CpuSoftmax = CpuSoftmaxGeneric; using CpuLogSoftmax = CpuSoftmaxGeneric; diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h new file mode 100644 index 0000000000..644018a718 --- /dev/null +++ b/src/runtime/cpu/utils/CpuAuxTensorHandler.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
+#define ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Wraps an auxiliary tensor: imports a caller-provided buffer when it is at least info.total_size(), otherwise (slot constructor only) allocates its own */
+class CpuAuxTensorHandler
+{
+public:
+    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
+        : _tensor()
+    {
+        _tensor.allocator()->soft_init(info);
+        // Borrow the buffer of the tensor stored under slot_id when it is large enough; otherwise allocate (and optionally publish) our own
+        ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
+        if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
+        {
+            _tensor.allocator()->allocate();
+            if(pack_inject)
+            {
+                pack.add_tensor(slot_id, &_tensor);
+                _injected_tensor_pack = &pack;
+                _injected_slot_id     = slot_id;
+            }
+        }
+        else
+        {
+            _tensor.allocator()->import_memory(packed_tensor->buffer());
+        }
+    }
+    /** Wraps the buffer of @p tensor as @p info. NOTE(review): nothing is imported or allocated when @p tensor is smaller than @p info - confirm callers guarantee the size */
+    CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor)
+        : _tensor()
+    {
+        _tensor.allocator()->soft_init(info);
+        if(info.total_size() <= tensor.info()->total_size())
+        {
+            _tensor.allocator()->import_memory(tensor.buffer());
+        }
+    }
+    /** Non-copyable: the slot constructor may have stored the address of _tensor in a pack, and the destructor removes that entry */
+    CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
+    CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler &) = delete;
+    /** Removes from the pack the tensor added by the slot constructor (only when pack_inject took effect) */
+    ~CpuAuxTensorHandler()
+    {
+        if(_injected_tensor_pack)
+        {
+            _injected_tensor_pack->remove_tensor(_injected_slot_id);
+        }
+    }
+    /** @return Pointer to the wrapped tensor, which is a member of this handler */
+    ITensor *get()
+    {
+        return &_tensor;
+    }
+    /** @return Same pointer as get() */
+    ITensor *operator()()
+    {
+        return &_tensor;
+    }
+
+private:
+    Tensor       _tensor{};
+    ITensorPack *_injected_tensor_pack{ nullptr };
+    int          _injected_slot_id{ TensorType::ACL_UNKNOWN };
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
\ No newline at end of file
--
cgit v1.2.1