aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2021-07-30 12:21:07 +0100
committerSheri Zhang <sheri.zhang@arm.com>2021-08-04 11:04:22 +0000
commit663419457b02238687cb329afcddc73719bdb8fa (patch)
tree040abeb9878085df78841cc907036b9293e7a644
parent4164814a099773c0a512889473c980bc148e590f (diff)
downloadComputeLibrary-663419457b02238687cb329afcddc73719bdb8fa.tar.gz
Avoid over-allocation of temporary buffers within CpuWinogradConv2d
Resolves: COMPMID-4716 Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Change-Id: Ie036d2bb7a243301a62f089b3920ebee0f409190 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6028 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp13
-rw-r--r--src/runtime/cpu/operators/CpuWinogradConv2d.cpp38
-rw-r--r--src/runtime/cpu/utils/CpuAuxTensorHandler.h10
3 files changed, 26 insertions, 35 deletions
diff --git a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
index 74b031b226..5620d36e2c 100644
--- a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
+++ b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp
@@ -195,8 +195,7 @@ unsigned int CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTi
{
const KernelShape shape(num_output_channels, KernelRows, KernelCols, num_input_channels);
return static_cast<unsigned int>(
- // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T
- WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels) / sizeof(T));
+ WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels));
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -298,14 +297,13 @@ unsigned int CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTile
// Construct shapes for the input and kernel tensors.
const Tensor4DShape input_shape(num_batches, num_rows, num_cols, num_channels);
const KernelShape kern_shape(1, KernelRows, KernelCols, num_channels);
- // Return the size, converted into units of TIn
- return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding) / sizeof(T));
+ return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding));
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
unsigned int CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
{
- return _transform->get_working_space_size(num_threads) / sizeof(T);
+ return _transform->get_working_space_size(num_threads);
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -434,9 +432,8 @@ unsigned int CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTil
// Construct shapes for the input and kernel tensors.
const Tensor4DShape input_shape(num_batches, num_rows, num_cols, 1);
const KernelShape kern_shape(num_output_channels, KernelRows, KernelCols, 1);
- // Return the size, converted into units of TOut
return static_cast<unsigned int>(
- WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels) / sizeof(T));
+ WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels));
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -448,7 +445,7 @@ CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, Kernel
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
unsigned int CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const
{
- return _transform->get_working_space_size(num_threads) / sizeof(T);
+ return _transform->get_working_space_size(num_threads);
}
template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
index a734e1797c..ca7b004f3f 100644
--- a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
@@ -549,12 +549,6 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
_kernel_storage = b_info;
_output_transformed = d_info;
- // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
- dst->dimension(1), dst->dimension(3)),
- 1, dst->data_type());
- _output_nhwc = info;
-
const ITensorInfo *input_to_use = src;
ITensorInfo *output_to_use = dst;
PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
@@ -573,7 +567,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
&_input_transformed, input_matrix_stride, &_input_workspace);
const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
- TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, src->data_type());
+ TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
_input_workspace = input_workspace_info;
// Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
@@ -587,6 +581,11 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
// The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
if(_data_layout == DataLayout::NCHW)
{
+ // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
+ TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
+ dst->dimension(1), dst->dimension(3)),
+ 1, dst->data_type());
+ _output_nhwc = info;
output_to_use = &_output_nhwc;
}
const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
@@ -603,7 +602,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
activation);
const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
- TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, dst->data_type());
+ TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
_output_workspace = output_workspace_info;
// Reorder the convoluted output to ACL's ordering NCHW
@@ -631,20 +630,12 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
_aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS];
_aux_mem[TempResult] = asm_mem_req[TempResult];
- _aux_mem[InputTransformed] = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Persistent, input_storage_size, storage_alignment);
- _aux_mem[InputWorkspace] = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Persistent, input_workspace_size);
- if(_aux_mem[Pretranspose].size > 0)
- {
- // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
- }
- else
- {
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, _weights_hwio.total_size());
- }
+ _aux_mem[InputTransformed] = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Temporary, input_storage_size, storage_alignment);
+ _aux_mem[InputWorkspace] = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Temporary, input_workspace_size);
+ _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
_aux_mem[WeightsTransformed] = MemoryInfo(offset_int_vec(WeightsTransformed), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment);
- _aux_mem[OutputTransformed] = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Persistent, output_storage_size, storage_alignment);
- _aux_mem[OutputWorkspace] = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Persistent, output_workspace_size);
+ _aux_mem[OutputTransformed] = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Temporary, output_storage_size, storage_alignment);
+ _aux_mem[OutputWorkspace] = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Temporary, output_workspace_size);
}
Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
@@ -829,10 +820,7 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors)
ITensorPack transform_tensors{ { ACL_SRC, permuted_weights.get() }, { ACL_DST, transformed_weights.get() } };
NEScheduler::get().schedule_op(_transform_weights_kernel.get(), Window::DimX, _transform_weights_kernel->window(), transform_tensors);
- CpuAuxTensorHandler input_transformed(offset_int_vec(InputTransformed), _input_transformed, tensors, true);
- CpuAuxTensorHandler output_transformed(offset_int_vec(OutputTransformed), _output_transformed, tensors, true);
- ITensorPack gemm_pack = tensors;
- gemm_pack.add_const_tensor(ACL_SRC_0, input_transformed.get());
+ ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_1, transformed_weights.get());
_gemm_function->prepare(gemm_pack);
diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
index 0d1c927b5a..ae1cffb659 100644
--- a/src/runtime/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
+#include "src/common/utils/Log.h"
#include "support/Cast.h"
namespace arm_compute
@@ -38,7 +39,7 @@ namespace cpu
class CpuAuxTensorHandler
{
public:
- CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false)
+ CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
: _tensor()
{
if(info.total_size() == 0)
@@ -50,7 +51,12 @@ public:
ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
{
- _tensor.allocator()->allocate();
+ if(!bypass_alloc)
+ {
+ _tensor.allocator()->allocate();
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
+ }
+
if(pack_inject)
{
pack.add_tensor(slot_id, &_tensor);