From 7179837ac9ef82c6f784aaee999be347bc7764fd Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 17 Apr 2019 13:01:54 +0100 Subject: COMPMID-2112: Rework memory manager support in NEWinogradLayer Change-Id: I3b86de829f70963f25564a6934926d0a13062763 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/1006 Tested-by: Arm Jenkins Reviewed-by: Pablo Marquez Comments-Addressed: Arm Jenkins --- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 94 ++++++++++------------ 1 file changed, 44 insertions(+), 50 deletions(-) diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index b4247be6cb..2332450c04 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -380,20 +380,17 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * // Kernel Storage const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) - * data_type_size - + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + * data_type_size; // Input storage const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) - * data_type_size - + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + * data_type_size; // Output storage const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) - * data_type_size - + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */ + * data_type_size; ; const KernelShape kernel_shape({ out_channels, static_cast(kernel_size.height), static_cast(kernel_size.width), in_channels }); const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape); @@ -446,62 +443,58 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * 1, _output->info()->data_type()); _output_nhwc.allocator()->init(info); - // Configure the InputTransform - _memory_group.manage(&_input_transformed); - _memory_group.manage(&_output_transformed); - _memory_group.manage(&_input_workspace); - _memory_group.manage(&_output_workspace); + const ITensor *input_to_use = _input; + ITensor *output_to_use = _output; + PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); + const unsigned int max_num_threads = NEScheduler::get().num_threads(); + // Configure the kernel to transform the input tensor from NCHW -> NHWC if(data_layout == DataLayout::NCHW) { - // configure the kernel to transform the input tensor from NCHW -> NHWC + _memory_group.manage(&_input_nhwc); _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); - _input_nhwc.allocator()->allocate(); - transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - &_input_transformed, input_matrix_stride, &_input_workspace); - - // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] - _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U)); - - transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); - - //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method - _memory_group.manage(&_output_nhwc); - transform_output_kernel->configure(biases, &_output_transformed, - output_matrix_stride, &_output_nhwc, - in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace); + input_to_use = &_input_nhwc; + weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); } - else - { - transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - &_input_transformed, input_matrix_stride, &_input_workspace); - - // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] - _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U)); - - transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); - - transform_output_kernel->configure(biases, &_output_transformed, - output_matrix_stride, _output, - in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace); - } - - //Configure input/output workspaces, get_working_space_size() must be called after configure() - const unsigned int max_num_threads = NEScheduler::get().num_threads_hint(); - const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads); - const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads); - TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type()); + // Configure input transform kernel + _memory_group.manage(&_input_transformed); + _memory_group.manage(&_input_workspace); + transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, + &_input_transformed, input_matrix_stride, &_input_workspace); + const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads); + TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type()); _input_workspace.allocator()->init(input_workspace_info); + _input_workspace.allocator()->allocate(); + if(data_layout == DataLayout::NCHW) + { + _input_nhwc.allocator()->allocate(); + } - TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type()); - _output_workspace.allocator()->init(output_workspace_info); + // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] + _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector); + transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); + // Configure GEMM function + _memory_group.manage(&_output_transformed); _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f); _input_transformed.allocator()->allocate(); - _output_transformed.allocator()->allocate(); - _input_workspace.allocator()->allocate(); + + // Configure output transform function + // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method + if(data_layout == DataLayout::NCHW) + { + _memory_group.manage(&_output_nhwc); + output_to_use = &_output_nhwc; + } + transform_output_kernel->configure(biases, &_output_transformed, + output_matrix_stride, output_to_use, + in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace); + const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads); + TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type()); + _output_workspace.allocator()->init(output_workspace_info); _output_workspace.allocator()->allocate(); + _output_transformed.allocator()->allocate(); // Reorder the convoluted output to ACL's ordering NCHW if(data_layout == DataLayout::NCHW) @@ -541,6 +534,7 @@ void NEWinogradConvolutionLayer::run() //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs _gemm_function.run(); + // Transform output tensor to the spatial domain NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX); -- cgit v1.2.1