Diffstat (limited to 'src/cpu')
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.cpp | 29
-rw-r--r--  src/cpu/utils/CpuAuxTensorHandler.h | 78
2 files changed, 88 insertions, 19 deletions
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index 31c873c2ba..7460f2020c 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -839,23 +839,26 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
     auto weights = gemm_pack.get_const_tensor(TensorType::ACL_SRC_1);
     ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
     // Re-interpreted weights. Only tensor shape is changed. Only memory import, no allocation
+    const bool use_reinterpreted_wei = (_run_wt && _wt_method == WeightTransformMethod::ReinterpretThenTranspose);
     CpuAuxTensorHandler reinterpreted_wei(
         _weights_reshaped, *weights,
         /* import only if we chose the ReinterpretThenTranspose path, because otherwise the weight may have been freed */
-        !(_run_wt && _wt_method == WeightTransformMethod::ReinterpretThenTranspose));
-    CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
+        !use_reinterpreted_wei);
+
+    const bool use_reshaped_wei = (_run_wt && (_wt_method == WeightTransformMethod::ReshapeThenTranspose ||
+                                               _wt_method == WeightTransformMethod::FusedReshapeAndTranspose));
+    CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors,
+                                     false /* pack_inject */, !use_reshaped_wei /* bypass_alloc */,
+                                     !use_reshaped_wei /* bypass_import */
+    );
     // Update the weights to use if it has been reshaped
-    if (_run_wt)
+    if (use_reinterpreted_wei)
     {
-        if (_wt_method == WeightTransformMethod::ReinterpretThenTranspose)
-        {
-            gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get());
-        }
-        else if (_wt_method == WeightTransformMethod::ReshapeThenTranspose ||
-                 _wt_method == WeightTransformMethod::FusedReshapeAndTranspose)
-        {
-            gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
-        }
+        gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get());
+    }
+    else if (use_reshaped_wei)
+    {
+        gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
     }
 
     // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h
index 0a39fdba81..3b980ce60b 100644
--- a/src/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/cpu/utils/CpuAuxTensorHandler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,12 +35,74 @@ namespace arm_compute
 {
 namespace cpu
 {
-/* Tensor handler to wrap and handle tensor allocations on workspace buffers */
+/** Tensor handler to wrap and handle tensor allocations on workspace buffers
+ *
+ * @note Important: Despite the impression given by its name, the handler owns, rather than merely points to, the
+ *       underlying tensor memory.
+ *
+ * @note About memory handling using bypass_* flags
+ *       The bypass_alloc / bypass_import flags are meant to skip the expensive auxiliary tensor memory allocations or
+ *       imports that are not needed during runtime, e.g. when the handler is not used at all in some branch of execution.
+ *
+ *       If not handled correctly, these two flags can lead to performance issues (not bypassing when we should) or to
+ *       memory bugs (bypassing when we should not).
+ *
+ *       Make sure:
+ *
+ *       1. The aux tensor handlers must always be declared at the root level, or at the same level as the run/prepare
+ *          methods that potentially use them.
+ *
+ *          Once the handler is destroyed (e.g. when going out of scope), the memory it owns (returned by the get()
+ *          method) will also be destroyed.
+ *
+ *          Thus it's important to ensure the handler is always in scope while it is being used by an operator / kernel.
+ *
+ *       2. The handler's bypass_alloc and bypass_import flags should always be the inverse of whether the handler is
+ *          used by run/prepare in its surrounding scope. (This usually means being added to some tensor pack.)
+ *
+ *          This ensures we bypass if and only if the aux tensor is not used by the op / kernel later.
+ *
+ *
+ *       So the general usage pattern goes like this:
+ *
+ *          bool use_aux_tensor = some_condition_about_when_to_use_the_aux_tensor;
+ *
+ *          CpuAuxTensorHandler aux_handler{..., !use_aux_tensor}; // bypass_alloc / bypass_import
+ *
+ *          if (use_aux_tensor)
+ *          {
+ *              tensor_pack.add_tensor(aux_handler.get());
+ *          }
+ *          op.run(tensor_pack);
+ */
 class CpuAuxTensorHandler
 {
 public:
-    CpuAuxTensorHandler(
-        int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+    /** Create a temporary tensor handle by either importing an existing tensor from a tensor pack or allocating a
+     * new one.
+     *
+     * @param[in]     slot_id       Slot id of the tensor to be retrieved in the tensor pack.
+     *                              If no such tensor exists in the tensor pack, a new tensor will be allocated.
+     * @param[in]     info          Tensor info containing the requested size of the new tensor.
+     *                              If the requested size is larger than the tensor retrieved from the tensor pack,
+     *                              a new tensor will be allocated.
+     * @param[in,out] pack          Tensor pack to retrieve the old tensor from. When @p pack_inject is true, the new
+     *                              tensor will also be added to it.
+     * @param[in]     pack_inject   In case of a newly allocated tensor, whether to add this tensor back to the
+     *                              @p pack.
+     * @param[in]     bypass_alloc  Bypass allocation in case of a new tensor.
+     *                              This is to prevent unnecessary memory operations when the handler object is not
+     *                              used.
+     * @param[in]     bypass_import Bypass import in case of a retrieved tensor.
+     *                              This is to prevent unnecessary memory operations when the handler object is not
+     *                              used.
+     */
+    CpuAuxTensorHandler(int slot_id,
+                        TensorInfo  &info,
+                        ITensorPack &pack,
+                        bool         pack_inject   = false,
+                        bool         bypass_alloc  = false,
+                        bool         bypass_import = false)
         : _tensor()
     {
         if (info.total_size() == 0)
@@ -67,7 +129,10 @@ public:
         }
         else
         {
-            _tensor.allocator()->import_memory(packed_tensor->buffer());
+            if (!bypass_import)
+            {
+                _tensor.allocator()->import_memory(packed_tensor->buffer());
+            }
         }
     }
 
@@ -76,7 +141,8 @@ public:
      *
      * @param[in] info          New tensor info to "assign" to @p tensor
      * @param[in] tensor        Tensor to be assigned a new @ref TensorInfo
-     * @param[in] bypass_import Bypass importing @p tensor's memory into the handler
+     * @param[in] bypass_import Bypass importing @p tensor's memory into the handler.
+     *                          This is to prevent unnecessary memory operations when the handler object is not used
      */
     CpuAuxTensorHandler(TensorInfo &info, const ITensor &tensor, bool bypass_import = false) : _tensor()
     {
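
The usage pattern described in the new header comment is the one CpuGemmConv2d::run() now follows: compute the use-condition once, pass its negation as the bypass flags, and add the auxiliary tensor to the pack only when the condition holds. The sketch below illustrates that pattern in isolation and is not part of the patch: the function name, the AuxBuffer slot constant, the aux_info argument and the run_gemm callback are hypothetical placeholders, and the include paths assume the library's own source tree; only CpuAuxTensorHandler, ITensorPack, offset_int_vec and add_const_tensor are taken from the change above.

// Illustrative sketch only, not a drop-in implementation.
#include <functional>

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"

namespace
{
constexpr int AuxBuffer = 0; // hypothetical auxiliary-tensor slot, analogous to WeightsReshaped above

void run_with_optional_aux(arm_compute::ITensorPack                              &tensors,
                           arm_compute::TensorInfo                               &aux_info,
                           bool                                                   use_aux,
                           const std::function<void(arm_compute::ITensorPack &)> &run_gemm)
{
    using namespace arm_compute;
    using namespace arm_compute::cpu;

    // Declare the handler at the same level as the code that uses it, so the memory it owns
    // stays alive for the whole call. Bypass allocation and import when the tensor is unused.
    CpuAuxTensorHandler aux(offset_int_vec(AuxBuffer), aux_info, tensors,
                            false /* pack_inject */, !use_aux /* bypass_alloc */,
                            !use_aux /* bypass_import */);

    ITensorPack pack = tensors;
    if (use_aux)
    {
        // Expose the aux tensor to the downstream operator only when it is actually needed.
        pack.add_const_tensor(TensorType::ACL_SRC_1, aux.get());
    }
    run_gemm(pack);
}
} // namespace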