From 663419457b02238687cb329afcddc73719bdb8fa Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 30 Jul 2021 12:21:07 +0100
Subject: Avoid over-allocation of temporary buffers within CpuWinogradConv2d

Resolves: COMPMID-4716
Signed-off-by: Georgios Pinitas
Change-Id: Ie036d2bb7a243301a62f089b3920ebee0f409190
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6028
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Gian Marco Iodice
---
 src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp | 13 ++++----
 src/runtime/cpu/operators/CpuWinogradConv2d.cpp  | 38 ++++++++----------------
 src/runtime/cpu/utils/CpuAuxTensorHandler.h      | 10 +++++--
 3 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
index 74b031b226..5620d36e2c 100644
--- a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
+++ b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
@@ -195,8 +195,7 @@ unsigned int CpuWinogradConv2dTransformWeightsKernel
     return static_cast<unsigned int>(
-        // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T
-        WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels) / sizeof(T));
+        WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels));
 }
 
 template
@@ -298,14 +297,13 @@ unsigned int CpuWinogradConv2dTransformInputKernel
-    return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding) / sizeof(T));
+    return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding));
 }
 
 template
 unsigned int CpuWinogradConv2dTransformInputKernel::get_working_space_size(unsigned int num_threads) const
 {
-    return _transform->get_working_space_size(num_threads) / sizeof(T);
+    return _transform->get_working_space_size(num_threads);
 }
 
 template
@@ -434,9 +432,8 @@ unsigned int CpuWinogradConv2dTransformOutputKernel
     return static_cast<unsigned int>(
-        WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels) / sizeof(T));
+        WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels));
 }
 
 template
@@ -448,7 +445,7 @@ CpuWinogradConv2dTransformOutputKernel
 unsigned int CpuWinogradConv2dTransformOutputKernel::get_working_space_size(unsigned int num_threads) const
 {
-    return _transform->get_working_space_size(num_threads) / sizeof(T);
+    return _transform->get_working_space_size(num_threads);
 }
 
 template
diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
index a734e1797c..ca7b004f3f 100644
--- a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
@@ -549,12 +549,6 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
     _kernel_storage = b_info;
     _output_transformed = d_info;
 
-    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
-    TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
-                                dst->dimension(1), dst->dimension(3)),
-                    1, dst->data_type());
-    _output_nhwc = info;
-
     const ITensorInfo *input_to_use = src;
     ITensorInfo *output_to_use = dst;
     PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
@@ -573,7 +567,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
     transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols,
                                       in_shape.n_channels, use_padding_type, &_input_transformed, input_matrix_stride, &_input_workspace);
     const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
-    TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, src->data_type());
+    TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
     _input_workspace = input_workspace_info;
 
     // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
@@ -587,6 +581,11 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
     // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
     if(_data_layout == DataLayout::NCHW)
     {
+        // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
+        TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
+                                    dst->dimension(1), dst->dimension(3)),
+                        1, dst->data_type());
+        _output_nhwc = info;
         output_to_use = &_output_nhwc;
     }
     const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
@@ -603,7 +602,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
                                        activation);
 
     const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
-    TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, dst->data_type());
+    TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
     _output_workspace = output_workspace_info;
 
     // Reorder the convoluted output to ACL's ordering NCHW
@@ -631,20 +630,12 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
     _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS];
     _aux_mem[TempResult] = asm_mem_req[TempResult];
 
-    _aux_mem[InputTransformed] = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Persistent, input_storage_size, storage_alignment);
-    _aux_mem[InputWorkspace] = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Persistent, input_workspace_size);
-    if(_aux_mem[Pretranspose].size > 0)
-    {
-        // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
-    }
-    else
-    {
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, _weights_hwio.total_size());
-    }
+    _aux_mem[InputTransformed] = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Temporary, input_storage_size, storage_alignment);
+    _aux_mem[InputWorkspace] = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Temporary, input_workspace_size);
+    _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
     _aux_mem[WeightsTransformed] = MemoryInfo(offset_int_vec(WeightsTransformed), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment);
-    _aux_mem[OutputTransformed] = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Persistent, output_storage_size, storage_alignment);
-    _aux_mem[OutputWorkspace] = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Persistent, output_workspace_size);
+    _aux_mem[OutputTransformed] = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Temporary, output_storage_size, storage_alignment);
+    _aux_mem[OutputWorkspace] = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Temporary, output_workspace_size);
 }
 
 Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
@@ -829,10 +820,7 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors)
         ITensorPack transform_tensors{ { ACL_SRC, permuted_weights.get() }, { ACL_DST, transformed_weights.get() } };
         NEScheduler::get().schedule_op(_transform_weights_kernel.get(), Window::DimX, _transform_weights_kernel->window(), transform_tensors);
 
-        CpuAuxTensorHandler input_transformed(offset_int_vec(InputTransformed), _input_transformed, tensors, true);
-        CpuAuxTensorHandler output_transformed(offset_int_vec(OutputTransformed), _output_transformed, tensors, true);
-        ITensorPack gemm_pack = tensors;
-        gemm_pack.add_const_tensor(ACL_SRC_0, input_transformed.get());
+        ITensorPack gemm_pack = tensors;
         gemm_pack.add_const_tensor(ACL_SRC_1, transformed_weights.get());
         _gemm_function->prepare(gemm_pack);
 
diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
index 0d1c927b5a..ae1cffb659 100644
--- a/src/runtime/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -38,7 +39,7 @@ namespace cpu
 class CpuAuxTensorHandler
 {
 public:
-    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
+    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
         : _tensor()
     {
         if(info.total_size() == 0)
@@ -50,7 +51,12 @@ public:
         ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
         if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
         {
-            _tensor.allocator()->allocate();
+            if(!bypass_alloc)
+            {
+                _tensor.allocator()->allocate();
+                ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
+            }
+
             if(pack_inject)
             {
                 pack.add_tensor(slot_id, &_tensor);
--
cgit v1.2.1
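
Note on the `bypass_alloc` flag introduced in CpuAuxTensorHandler above: together with moving the transformed/workspace buffers from MemoryLifetime::Persistent to MemoryLifetime::Temporary, the idea is that a handler can wrap a tensor whose backing memory is expected to arrive through the ITensorPack at run time, so the eager allocate() can be skipped. Below is a minimal standalone sketch of that pattern; it is not ComputeLibrary code, and the types (Tensor, Pack, AuxTensorHandler) and the slot id are invented for illustration.

// Minimal standalone sketch of the bypass_alloc pattern (illustration only,
// not the ComputeLibrary API): the handler allocates a private buffer only if
// the pack does not already provide a large-enough tensor for its slot and
// allocation has not been explicitly bypassed.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

// Stand-in for an externally managed tensor: a plain byte buffer.
struct Tensor
{
    std::vector<uint8_t> storage;
    void        allocate(std::size_t bytes) { storage.resize(bytes); }
    std::size_t size() const { return storage.size(); }
};

// Stand-in for ITensorPack: maps slot ids to caller-provided tensors.
using Pack = std::map<int, Tensor *>;

class AuxTensorHandler
{
public:
    AuxTensorHandler(int slot_id, std::size_t required_bytes, Pack &pack, bool bypass_alloc = false)
    {
        const auto it        = pack.find(slot_id);
        const bool packed_ok = (it != pack.end()) && (it->second->size() >= required_bytes);
        if(packed_ok)
        {
            _tensor = it->second; // reuse memory owned by the caller / memory manager
        }
        else if(!bypass_alloc)
        {
            _owned.allocate(required_bytes); // fall back to a private allocation
        }
        // else: keep the (empty) owned buffer; the caller is expected to provide memory before use.
    }

    Tensor *get() { return _tensor; }

private:
    Tensor  _owned{};
    Tensor *_tensor{ &_owned };
};

int main()
{
    constexpr int kSlotInputTransformed = 0; // invented slot id for the example

    Tensor workspace;
    workspace.allocate(1024); // pretend a memory manager provisioned this buffer

    Pack pack;
    pack[kSlotInputTransformed] = &workspace;

    // Memory comes from the pack, so no private allocation happens here even
    // though bypass_alloc is set.
    AuxTensorHandler handler(kSlotInputTransformed, 512, pack, /* bypass_alloc = */ true);
    std::cout << "handler uses a " << handler.get()->size() << "-byte buffer from the pack\n";
    return 0;
}

In the patched operator the same effect comes from marking InputTransformed, OutputTransformed and the two workspaces as MemoryLifetime::Temporary, so their memory only needs to live for the duration of a run rather than persisting with the operator.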