From 7179837ac9ef82c6f784aaee999be347bc7764fd Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Wed, 17 Apr 2019 13:01:54 +0100
Subject: COMPMID-2112: Rework memory manager support in NEWinogradLayer

Change-Id: I3b86de829f70963f25564a6934926d0a13062763
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1006
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../NEON/functions/NEWinogradConvolutionLayer.cpp  | 94 ++++++++++------------
 1 file changed, 44 insertions(+), 50 deletions(-)
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index b4247be6cb..2332450c04 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -380,20 +380,17 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
     // Kernel Storage
     const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
                                                                                          in_channels)
-                                       * data_type_size
-                                       + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                       * data_type_size;
 
     // Input storage
     const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
                                                                                      use_same_padding)
-                                      * data_type_size
-                                      + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                      * data_type_size;
 
     // Output storage
     const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
                                                                                         use_same_padding)
-                                       * data_type_size
-                                       + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
+                                       * data_type_size;
     ;
     const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
     const int         kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
@@ -446,62 +443,58 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
                     1, _output->info()->data_type());
     _output_nhwc.allocator()->init(info);
 
-    // Configure the InputTransform
-    _memory_group.manage(&_input_transformed);
-    _memory_group.manage(&_output_transformed);
-    _memory_group.manage(&_input_workspace);
-    _memory_group.manage(&_output_workspace);
+    const ITensor     *input_to_use  = _input;
+    ITensor           *output_to_use = _output;
+    PermutationVector  weights_permutation_vector(3U, 0U, 1U, 2U);
+    const unsigned int max_num_threads = NEScheduler::get().num_threads();
 
+    // Configure the kernel to transform the input tensor from NCHW -> NHWC
     if(data_layout == DataLayout::NCHW)
     {
-        // configure the kernel to transform the input tensor from NCHW -> NHWC
+        _memory_group.manage(&_input_nhwc);
         _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
-        _input_nhwc.allocator()->allocate();
-        transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                          &_input_transformed, input_matrix_stride, &_input_workspace);
-
-        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-
-        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
-        //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-        _memory_group.manage(&_output_nhwc);
-        transform_output_kernel->configure(biases, &_output_transformed,
-                                           output_matrix_stride, &_output_nhwc,
-                                           in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+        input_to_use               = &_input_nhwc;
+        weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
     }
-    else
-    {
-        transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                          &_input_transformed, input_matrix_stride, &_input_workspace);
-
-        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
-
-        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
-        transform_output_kernel->configure(biases, &_output_transformed,
-                                           output_matrix_stride, _output,
-                                           in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
-    }
-
-    //Configure input/output workspaces, get_working_space_size() must be called after configure()
-    const unsigned int max_num_threads       = NEScheduler::get().num_threads_hint();
-    const size_t       input_workspace_size  = transform_input_kernel->get_working_space_size(max_num_threads);
-    const size_t       output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
 
-    TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
+    // Configure input transform kernel
+    _memory_group.manage(&_input_transformed);
+    _memory_group.manage(&_input_workspace);
+    transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                      &_input_transformed, input_matrix_stride, &_input_workspace);
+    const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
+    TensorInfo   input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
     _input_workspace.allocator()->init(input_workspace_info);
+    _input_workspace.allocator()->allocate();
+    if(data_layout == DataLayout::NCHW)
+    {
+        _input_nhwc.allocator()->allocate();
+    }
 
-    TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
-    _output_workspace.allocator()->init(output_workspace_info);
+    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+    _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
+    transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
 
+    // Configure GEMM function
+    _memory_group.manage(&_output_transformed);
     _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
     _input_transformed.allocator()->allocate();
-    _output_transformed.allocator()->allocate();
-    _input_workspace.allocator()->allocate();
+
+    // Configure output transform function
+    // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+    if(data_layout == DataLayout::NCHW)
+    {
+        _memory_group.manage(&_output_nhwc);
+        output_to_use = &_output_nhwc;
+    }
+    transform_output_kernel->configure(biases, &_output_transformed,
+                                       output_matrix_stride, output_to_use,
+                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels, &_output_workspace);
+    const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
+    TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
+    _output_workspace.allocator()->init(output_workspace_info);
     _output_workspace.allocator()->allocate();
+    _output_transformed.allocator()->allocate();
 
     // Reorder the convoluted output to ACL's ordering NCHW
     if(data_layout == DataLayout::NCHW)
@@ -541,6 +534,7 @@ void NEWinogradConvolutionLayer::run()
 
     //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
     _gemm_function.run();
+
     // Transform output tensor to the spatial domain
     NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
 
-- 
cgit v1.2.1