From 1562be3e8a449360a90af75f6f1481a30d41be75 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 8 Mar 2018 19:09:19 +0000
Subject: COMPMID-998: Release unused trainable parameters.

Change-Id: I361a520f34080016a25bc86e1e6789777c5152c1
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/124432
Reviewed-by: Anthony Barbier
Tested-by: Jenkins
---
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   | 23 +++++++++++++++-------
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |  8 +++++++-
 .../CL/functions/CLGEMMConvolutionLayer.cpp        | 14 +++++++++----
 .../CL/functions/CLLocallyConnectedLayer.cpp       | 12 ++++++++---
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 21 ++++++++++++++++----
 .../NEON/functions/NEFullyConnectedLayer.cpp       |  8 +++++++-
 .../NEON/functions/NEGEMMConvolutionLayer.cpp      |  8 +++++++-
 .../NEON/functions/NELocallyConnectedLayer.cpp     | 12 ++++++++---
 8 files changed, 82 insertions(+), 24 deletions(-)

(limited to 'src/runtime')

diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 112af60f35..8d7c92bdf1 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -64,7 +64,7 @@ void CLDepthwiseConvolutionLayer3x3::run()
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -78,7 +78,9 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     bool            append_bias = (biases != nullptr) && !_is_quantized;
     const GPUTarget gpu_target  = CLScheduler::get().target();
@@ -154,16 +156,23 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
 
 void CLDepthwiseConvolutionLayer::run()
 {
-    CLScheduler::get().enqueue(_im2col_kernel);
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-    CLScheduler::get().enqueue(_weights_reshape_kernel);
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
 
+    CLScheduler::get().enqueue(_im2col_kernel);
     CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_weights_fill_border);
     CLScheduler::get().enqueue(_v2mm_kernel);
-
     CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-
     if(_is_quantized)
     {
         CLScheduler::get().enqueue(_output_stage_kernel);
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 2b4670b98c..676706fb17 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -76,7 +76,7 @@ Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, c
 
 CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
-      _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
+      _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -152,6 +152,7 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
     _is_fc_after_conv  = true;
     _accumulate_biases = false;
     _is_quantized      = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _original_weights  = weights;
 
     // Configure gemmlowp output
     if(_is_quantized)
@@ -316,8 +317,13 @@ void CLFullyConnectedLayer::run()
     // Reshape of the weights (happens only once)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights_kernel.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index e7ad62f5ff..f43e100565 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -90,8 +90,8 @@ void CLConvolutionLayerReshapeWeights::run()
 }
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _im2col_output(),
-      _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _original_weights(nullptr),
+      _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
 {
 }
 
@@ -164,7 +164,9 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
                                                               weights_info,
                                                               dilation));
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     const DataType dt = input->info()->data_type();
 
@@ -349,9 +351,13 @@ void CLGEMMConvolutionLayer::run()
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
-        _reshape_weights.run();
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
+        _reshape_weights.run();
         _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index a3eb5010bd..986fe00973 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons
 
 CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false)
+      _is_first_run(false), _original_weights(nullptr)
 {
 }
 
@@ -126,8 +126,9 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
 
-    bool _has_bias = (biases != nullptr);
-    _is_first_run  = true;
+    bool _has_bias     = (biases != nullptr);
+    _original_weights  = weights;
+    _is_first_run      = true;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
@@ -169,8 +170,13 @@ void CLLocallyConnectedLayer::run()
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _is_first_run = false;
         CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 95fcf8805e..f28ed715f6 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -153,7 +153,7 @@ void NEDepthwiseConvolutionLayer3x3::run()
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
@@ -167,7 +167,9 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
 
     // Should bias be appended ?
     bool append_bias = (biases != nullptr) && !_is_quantized;
@@ -241,10 +243,21 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
 
 void NEDepthwiseConvolutionLayer::run()
 {
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
+
     NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
-    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
-    NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
     NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
     if(_is_quantized)
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 26b7271710..b310ad35e3 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -132,7 +132,7 @@ void NEFullyConnectedLayerReshapeWeights::run()
 
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
-      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
+      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
 {
 }
 
@@ -163,6 +163,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
     const int    num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
     const size_t linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
 
+    _original_weights     = weights;
     _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
     _are_weights_reshaped = are_weights_reshaped;
     _accumulate_biases    = biases != nullptr;
@@ -324,8 +325,13 @@ void NEFullyConnectedLayer::run()
     // Reshape of the weights (happens only once)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights_kernel.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index d9707d95e0..b2dd0227a5 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -217,7 +217,7 @@ Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInf
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
     : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
-      _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
+      _output_col2im_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
       _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false)
 {
 }
 
@@ -267,6 +267,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
 
     ARM_COMPUTE_ERROR_THROW_ON(status);
 
+    _original_weights = weights;
     const unsigned int fixed_point_position = input->info()->fixed_point_position();
 
     const ITensor *biases_to_use = (_append_bias) ? biases : nullptr;
@@ -549,8 +550,13 @@ void NEGEMMConvolutionLayer::run()
     // Run weights reshaping (Runs once for every configure)
     if(!_are_weights_reshaped)
    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 973559441f..913acf86a2 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons
 
 NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false)
+      _is_first_run(false), _original_weights(nullptr)
 {
 }
 
@@ -126,8 +126,9 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
 
-    bool _has_bias = (biases != nullptr);
-    _is_first_run  = true;
+    bool _has_bias     = (biases != nullptr);
+    _is_first_run      = true;
+    _original_weights  = weights;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
@@ -169,8 +170,13 @@ void NELocallyConnectedLayer::run()
     // Run weights reshaping (Runs once for every configure)
    if(_is_first_run)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _is_first_run = false;
         NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
--
cgit v1.2.1
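
Editor's note: the standalone C++ sketch below is not part of the patch. It illustrates, in minimal form, the pattern the commit applies to all eight functions: reshape the trainable parameters once, on the first run() after configure(), and then mark the original weights tensor as unused so that its backing memory can be released. The Tensor and ReshapingLayer types here are hypothetical stand-ins that only mirror the spirit of the arm_compute is_used()/mark_as_unused() interface; they are not the library's classes.

#include <cassert>
#include <iostream>
#include <utility>
#include <vector>

// Hypothetical tensor: owns a buffer and a "used" flag a memory manager could consult.
class Tensor
{
public:
    explicit Tensor(std::vector<float> data) : _data(std::move(data)), _is_used(true) {}

    bool is_used() const { return _is_used; }

    // Flag the tensor as no longer needed and free its buffer immediately
    // (a real memory manager would decide when to release it).
    void mark_as_unused()
    {
        _is_used = false;
        _data.clear();
        _data.shrink_to_fit();
    }

    const std::vector<float> &data() const { return _data; }

private:
    std::vector<float> _data;
    bool               _is_used;
};

// Hypothetical layer that needs its weights only in reshaped form after the first run.
class ReshapingLayer
{
public:
    void configure(Tensor *weights)
    {
        _original_weights = weights;
        _is_first_run     = true;
    }

    void run()
    {
        // Run weights reshaping once for every configure(), then release the originals.
        if(_is_first_run)
        {
            assert(_original_weights->is_used());

            _weights_reshaped = _original_weights->data(); // stand-in for the reshape kernel
            _is_first_run     = false;

            // The original weights are never read again; let their memory go.
            _original_weights->mark_as_unused();
        }

        // Remaining kernels would operate on the reshaped copy from here on.
        std::cout << "run with " << _weights_reshaped.size() << " reshaped weights\n";
    }

private:
    Tensor            *_original_weights{ nullptr };
    bool               _is_first_run{ true };
    std::vector<float> _weights_reshaped{};
};

int main()
{
    Tensor         weights({ 1.f, 2.f, 3.f, 4.f });
    ReshapingLayer layer;

    layer.configure(&weights);
    layer.run(); // first run: reshapes and marks the original weights unused
    layer.run(); // subsequent runs skip the reshape entirely

    std::cout << "original weights still used? " << std::boolalpha << weights.is_used() << '\n';
    return 0;
}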