diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-08-20 17:26:45 +0100 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2021-08-24 09:00:23 +0000 |
commit | 87a74effff65f6fa1b0e565818e02c3b414ae1cf (patch) | |
tree | 0c5d63bbbcc285232959a5a4134f282a980ab4bf /src | |
parent | 511771fbe0a74e6d9dfd37ba9b4926a8315ec7aa (diff) | |
download | ComputeLibrary-87a74effff65f6fa1b0e565818e02c3b414ae1cf.tar.gz |
Re-use auxiliary memory within CpuWinogradConv2d operators
Input/Output transformation operations are independent and run in
different time-steps of the algorithm, so this memory can be re-used
between these transformation stages.
Moreover, reduce the allocation when extracting workspace sizes for
Winograd transformations. The underlying APIs mix sizes in bytes and
sizes in elements, so ensure the correct unit is used in each place:
the storage_size() member functions return elements, while the
working_space() functions return bytes.
Resolves: COMPMID-4781
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I705445ba7ca818cead48369db3cacd49684c7192
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6145
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp | 10 | ||||
-rw-r--r-- | src/runtime/cpu/operators/CpuWinogradConv2d.cpp | 45 | ||||
-rw-r--r-- | src/runtime/cpu/operators/CpuWinogradConv2d.h | 27 |
3 files changed, 43 insertions, 39 deletions
diff --git a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp index 5620d36e2c..9456f96354 100644 --- a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp +++ b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp @@ -194,8 +194,8 @@ template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, in unsigned int CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int num_output_channels, int num_input_channels) const { const KernelShape shape(num_output_channels, KernelRows, KernelCols, num_input_channels); - return static_cast<unsigned int>( - WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels)); + // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T + return static_cast<unsigned int>(WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels) / sizeof(T)); } template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> @@ -297,7 +297,8 @@ unsigned int CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTile // Construct shapes for the input and kernel tensors. const Tensor4DShape input_shape(num_batches, num_rows, num_cols, num_channels); const KernelShape kern_shape(1, KernelRows, KernelCols, num_channels); - return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding)); + // Return the size, converted into units of TIn + return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding) / sizeof(T)); } template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> @@ -432,8 +433,9 @@ unsigned int CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTil // Construct shapes for the input and kernel tensors. 
const Tensor4DShape input_shape(num_batches, num_rows, num_cols, 1); const KernelShape kern_shape(num_output_channels, KernelRows, KernelCols, 1); + // Return the size, converted into units of TOut return static_cast<unsigned int>( - WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels)); + WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels) / sizeof(T)); } template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp index ca7b004f3f..253280a951 100644 --- a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp +++ b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp @@ -494,14 +494,10 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei constexpr size_t storage_alignment = 64; // Kernel Storage - const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, - in_channels) - * data_type_size; + const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size; // Input storage - const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, - use_same_padding) - * data_type_size; + const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size; // Output storage const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size; @@ -558,7 +554,6 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei if(_data_layout == DataLayout::NCHW) { _permute_input->configure(src, &_input_nhwc, 
PermutationVector(2U, 0U, 1U)); - _aux_mem[PermutedInput] = MemoryInfo(offset_int_vec(PermutedInput), MemoryLifetime::Temporary, src->total_size()); input_to_use = &_input_nhwc; weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); } @@ -609,7 +604,6 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei if(_data_layout == DataLayout::NCHW) { _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U)); - _aux_mem[PermutedOutput] = MemoryInfo(offset_int_vec(PermutedOutput), MemoryLifetime::Temporary, dst->total_size()); } _transform_input_kernel = std::move(transform_input_kernel); @@ -630,12 +624,17 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS]; _aux_mem[TempResult] = asm_mem_req[TempResult]; - _aux_mem[InputTransformed] = MemoryInfo(offset_int_vec(InputTransformed), MemoryLifetime::Temporary, input_storage_size, storage_alignment); - _aux_mem[InputWorkspace] = MemoryInfo(offset_int_vec(InputWorkspace), MemoryLifetime::Temporary, input_workspace_size); + // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps. 
+ _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, input_storage_size, storage_alignment); + _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, output_storage_size, storage_alignment); + _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size)); _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); - _aux_mem[WeightsTransformed] = MemoryInfo(offset_int_vec(WeightsTransformed), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment); - _aux_mem[OutputTransformed] = MemoryInfo(offset_int_vec(OutputTransformed), MemoryLifetime::Temporary, output_storage_size, storage_alignment); - _aux_mem[OutputWorkspace] = MemoryInfo(offset_int_vec(OutputWorkspace), MemoryLifetime::Temporary, output_workspace_size); + _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment); + if(_data_layout == DataLayout::NCHW) + { + _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size()); + _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size()); + } } Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, @@ -757,9 +756,8 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) auto d = tensors.get_tensor(ACL_DST); CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true); - CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true); - CpuAuxTensorHandler input_transformed(offset_int_vec(InputTransformed), _input_transformed, tensors, true); - CpuAuxTensorHandler input_workspace(offset_int_vec(InputWorkspace), _input_workspace, tensors, true); + 
CpuAuxTensorHandler input_transformed(offset_int_vec(TransformedInput), _input_transformed, tensors, true); + CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true); const bool is_nchw = _data_layout == DataLayout::NCHW; if(is_nchw) @@ -773,15 +771,20 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : a }, { ACL_DST, input_transformed.get() }, { ACL_INT, input_workspace.get() } }; NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, _transform_input_kernel->window(), transform_input_pack); - CpuAuxTensorHandler output_transformed(offset_int_vec(OutputTransformed), _output_transformed, tensors, true); - CpuAuxTensorHandler weights_transformed(offset_int_vec(WeightsTransformed), _kernel_storage, tensors, true); + CpuAuxTensorHandler output_transformed(offset_int_vec(TransformedOutput), _output_transformed, tensors, true); + CpuAuxTensorHandler weights_transformed(offset_int_vec(TransformedWeights), _kernel_storage, tensors, true); // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs - ITensorPack gemm_pack{ { ACL_SRC, input_transformed.get() }, { ACL_SRC_1, weights_transformed.get() }, { ACL_DST, output_transformed.get() } }; + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC, input_transformed.get()); + gemm_pack.add_const_tensor(ACL_SRC_1, weights_transformed.get()); + gemm_pack.add_const_tensor(ACL_BIAS, nullptr); + gemm_pack.add_tensor(ACL_DST, output_transformed.get()); _gemm_function->run(gemm_pack); // Transform output tensor to the spatial domain - CpuAuxTensorHandler output_workspace(offset_int_vec(OutputWorkspace), _output_workspace, tensors, true); + CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true); + CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true); ITensorPack transform_output_pack{ 
{ ACL_SRC_0, c }, { ACL_SRC_1, output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : d }, { ACL_INT, output_workspace.get() } }; NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, _transform_output_kernel->window(), transform_output_pack); @@ -813,7 +816,7 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors) _permute_weights->run(permute_tensors); // Transform weights - ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(WeightsTransformed))); + ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf); CpuAuxTensorHandler transformed_weights(_kernel_storage, *weights_transf); diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.h b/src/runtime/cpu/operators/CpuWinogradConv2d.h index ae705ac86b..b5b9c3f2e3 100644 --- a/src/runtime/cpu/operators/CpuWinogradConv2d.h +++ b/src/runtime/cpu/operators/CpuWinogradConv2d.h @@ -93,20 +93,19 @@ public: private: enum AuxTensorIdx { - GemmWorkspace = 0, - Pretranspose, - InterleavedLHS, - TransposedRHS, - TempResult, - PermutedInput, - InputTransformed, - InputWorkspace, - PermutedOutput, - PermutedWeights, - WeightsTransformed, - OutputTransformed, - OutputWorkspace, - Count + GemmWorkspace = 0, + Pretranspose = 1, + InterleavedLHS = 2, + TransposedRHS = 3, + TempResult = 4, + TransformedInput = 5, + TransformedOutput = 6, + WorkspaceIO = 7, + TransformedWeights = 8, + PermutedWeights = 9, + PermutedInput = TransformedOutput, + PermutedOutput = TransformedInput, + Count = 10 }; std::unique_ptr<CpuGemm> _gemm_function; |