diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-07-30 12:21:07 +0100 |
---|---|---|
committer | Sheri Zhang <sheri.zhang@arm.com> | 2021-08-04 11:04:22 +0000 |
commit | 663419457b02238687cb329afcddc73719bdb8fa (patch) | |
tree | 040abeb9878085df78841cc907036b9293e7a644 /src/runtime/cpu | |
parent | 4164814a099773c0a512889473c980bc148e590f (diff) | |
download | ComputeLibrary-663419457b02238687cb329afcddc73719bdb8fa.tar.gz |
Avoid over-allocation of temporary buffers within CpuWinogradConv2d
Resolves: COMPMID-4716
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ie036d2bb7a243301a62f089b3920ebee0f409190
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6028
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/runtime/cpu')
-rw-r--r-- | src/runtime/cpu/operators/CpuWinogradConv2d.cpp | 38 | ||||
-rw-r--r-- | src/runtime/cpu/utils/CpuAuxTensorHandler.h | 10 |
2 files changed, 21 insertions, 27 deletions
diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
index a734e1797c..ca7b004f3f 100644
--- a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
@@ -549,12 +549,6 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
     _kernel_storage = b_info;
     _output_transformed = d_info;
 
-    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
-    TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
-                                dst->dimension(1), dst->dimension(3)),
-                    1, dst->data_type());
-    _output_nhwc = info;
-
     const ITensorInfo *input_to_use = src;
     ITensorInfo *output_to_use = dst;
     PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
@@ -573,7 +567,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
     transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
                                       &_input_transformed, input_matrix_stride, &_input_workspace);
     const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
-    TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, src->data_type());
+    TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
     _input_workspace = input_workspace_info;
 
     // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
@@ -587,6 +581,11 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
     // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
     if(_data_layout == DataLayout::NCHW)
     {
+        // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
+        TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
+                                    dst->dimension(1), dst->dimension(3)),
+                        1, dst->data_type());
+        _output_nhwc = info;
         output_to_use = &_output_nhwc;
     }
     const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
@@ -603,7 +602,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
                                        activation);
 
     const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
-    TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, dst->data_type());
+    TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
     _output_workspace = output_workspace_info;
 
     // Reorder the convoluted output to ACL's ordering NCHW
@@ -631,20 +630,12 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
     _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS];
     _aux_mem[TempResult] = asm_mem_req[TempResult];
 
-    _aux_mem[InputTransformed] = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Persistent, input_storage_size, storage_alignment);
-    _aux_mem[InputWorkspace] = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Persistent, input_workspace_size);
-    if(_aux_mem[Pretranspose].size > 0)
-    {
-        // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
-    }
-    else
-    {
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, _weights_hwio.total_size());
-    }
+    _aux_mem[InputTransformed] = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Temporary, input_storage_size, storage_alignment);
+    _aux_mem[InputWorkspace] = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Temporary, input_workspace_size);
+    _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
     _aux_mem[WeightsTransformed] = MemoryInfo(offset_int_vec(WeightsTransformed), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment);
-    _aux_mem[OutputTransformed] = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Persistent, output_storage_size, storage_alignment);
-    _aux_mem[OutputWorkspace] = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Persistent, output_workspace_size);
+    _aux_mem[OutputTransformed] = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Temporary, output_storage_size, storage_alignment);
+    _aux_mem[OutputWorkspace] = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Temporary, output_workspace_size);
 }
 
 Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
@@ -829,10 +820,7 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors)
         ITensorPack transform_tensors{ { ACL_SRC, permuted_weights.get() }, { ACL_DST, transformed_weights.get() } };
         NEScheduler::get().schedule_op(_transform_weights_kernel.get(), Window::DimX, _transform_weights_kernel->window(), transform_tensors);
 
-        CpuAuxTensorHandler input_transformed(offset_int_vec(InputTransformed), _input_transformed, tensors, true);
-        CpuAuxTensorHandler output_transformed(offset_int_vec(OutputTransformed), _output_transformed, tensors, true);
-        ITensorPack gemm_pack = tensors;
-        gemm_pack.add_const_tensor(ACL_SRC_0, input_transformed.get());
+        ITensorPack gemm_pack = tensors;
         gemm_pack.add_const_tensor(ACL_SRC_1, transformed_weights.get());
         _gemm_function->prepare(gemm_pack);
 
diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
index 0d1c927b5a..ae1cffb659 100644
--- a/src/runtime/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -38,7 +39,7 @@ namespace cpu
 class CpuAuxTensorHandler
 {
 public:
-    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
+    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
         : _tensor()
     {
         if(info.total_size() == 0)
@@ -50,7 +51,12 @@ public:
         ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
         if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
         {
-            _tensor.allocator()->allocate();
+            if(!bypass_alloc)
+            {
+                _tensor.allocator()->allocate();
+                ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
+            }
+
             if(pack_inject)
             {
                 pack.add_tensor(slot_id, &_tensor);
|