author    Georgios Pinitas <georgios.pinitas@arm.com>    2018-06-05 14:56:06 +0100
committer Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:53:09 +0000
commit    72219330fd85b1271e714d4ba894d6d8e26340c9 (patch)
tree      9ae0510087a1ca77b1695252a8621de3f2ab98af /src/runtime/CL
parent    c42f28d45e9b990276d54880d2cee9c9ee675a41 (diff)
COMPMID-1145: (API) Introduce prepare() stage (NEON/CL/GLES)
Change-Id: I5b46764f9c3154ec3e3b9c951cc9e6dfbcb81dfb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134255
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'src/runtime/CL')
-rw-r--r-- src/runtime/CL/functions/CLDeconvolutionLayer.cpp            | 18
-rw-r--r-- src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp     | 35
-rw-r--r-- src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp | 10
-rw-r--r-- src/runtime/CL/functions/CLGEMM.cpp                          |  6
-rw-r--r-- src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp          | 35
-rw-r--r-- src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp    | 68
-rw-r--r-- src/runtime/CL/functions/CLLocallyConnectedLayer.cpp         | 33
-rw-r--r-- src/runtime/CL/functions/CLRNNLayer.cpp                      | 20
8 files changed, 155 insertions, 70 deletions
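The patch wires prepare() into each function's run(), so existing callers keep working unchanged, while new callers can trigger the one-off work (weights reshaping, B-matrix transposition) explicitly before the first run(). A minimal caller-side sketch of the new stage, assuming the CLGEMM API of this release and invented tensor shapes:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Invented shapes (width x height): dst(16x32) = 1.f * a(64x32) * b(16x64)
    CLTensor a{}, b{}, dst{};
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));

    CLGEMM gemm;
    // reshape_b_only_on_first_run = true: B is transposed once, inside prepare()
    gemm.configure(&a, &b, nullptr, &dst, 1.f, 0.f, GEMMInfo(false, false, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map the tensors and fill a and b here ...

    gemm.prepare(); // optional: do the one-off work now instead of on the first run()
    gemm.run();
    CLScheduler::get().sync();
    return 0;
}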
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 6c54b18b81..4c1ea5b9a2 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -38,7 +38,8 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
: _memory_group(std::move(memory_manager)),
_scale_f(),
_conv_f(),
- _scaled_output()
+ _scaled_output(),
+ _is_prepared(false)
{
}
@@ -104,6 +105,8 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights,
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
+ _is_prepared = false;
+
_memory_group.manage(&_scaled_output);
// configure scale function
@@ -126,8 +129,21 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights,
void CLDeconvolutionLayer::run()
{
+ prepare();
+
_memory_group.acquire();
+
_scale_f.run();
_conv_f.run();
+
_memory_group.release();
}
+
+void CLDeconvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ _conv_f.prepare();
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index c2b24e3c20..1815361a72 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -91,7 +91,7 @@ void CLDepthwiseConvolutionLayer3x3::run()
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
+ _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
{
}
@@ -104,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
const size_t weights_h = weights->info()->dimension(1);
const size_t weights_z = weights->info()->dimension(2);
- _is_first_run = true;
+ _is_prepared = false;
_original_weights = weights;
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
@@ -182,7 +182,6 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
// Allocate intermediate tensors
_input_reshaped.allocator()->allocate();
- _weights_reshaped.allocator()->allocate();
_v2mm_output.allocator()->allocate();
}
@@ -235,18 +234,7 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
void CLDepthwiseConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- CLScheduler::get().enqueue(_weights_reshape_kernel);
- CLScheduler::get().enqueue(_v2mm_weights_fill_border);
- _is_first_run = false;
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
CLScheduler::get().enqueue(_im2col_kernel);
CLScheduler::get().enqueue(_v2mm_input_fill_border);
@@ -257,3 +245,20 @@ void CLDepthwiseConvolutionLayer::run()
CLScheduler::get().enqueue(_output_stage_kernel);
}
}
+
+void CLDepthwiseConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ _original_weights->mark_as_unused();
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
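Every migrated function follows the same shape as the hunk above: run() calls prepare() first, and prepare() does its work exactly once behind an _is_prepared latch — allocating the reshaped-weights buffer only at that point, enqueuing the one-off kernels, marking the original weights tensor unused so its backing memory can be reclaimed, and finishing the queue before setting the flag. A condensed, standalone illustration of that idiom (not library code; names invented for the sketch):

// Standalone sketch of the prepare() latch used throughout this patch.
class PreparedFunction
{
public:
    void run()
    {
        prepare(); // no-op after the first call

        // ... enqueue the per-invocation kernels ...
    }

    void prepare()
    {
        if(!_is_prepared)
        {
            // 1) Allocate tensors first needed here (e.g. reshaped weights),
            //    keeping configure() cheap and memory use lazy.
            // 2) Enqueue the one-off kernels (weights reshape, fill border).
            // 3) Mark the original weights tensor unused so the memory
            //    manager can reclaim it.
            // 4) Block until the queue drains, then latch the flag.
            _is_prepared = true;
        }
    }

private:
    bool _is_prepared{ false };
};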
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
index af2c6f0eb8..fa2c3affa3 100644
--- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,6 +45,14 @@ void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICL
void CLDepthwiseSeparableConvolutionLayer::run()
{
+ prepare();
+
_depthwise_conv.run();
_pointwise_conv.run();
+}
+
+void CLDepthwiseSeparableConvolutionLayer::prepare()
+{
+ _depthwise_conv.prepare();
+ _pointwise_conv.prepare();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index f9713bb586..bb76872700 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -84,12 +84,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
- // Store original b matrix
- _original_b = b;
-
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_is_prepared = false;
+ _original_b = b;
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
@@ -262,7 +260,7 @@ void CLGEMM::prepare()
{
if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
{
- // Run transpose kernel
+ // Run transpose kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
CLScheduler::get().enqueue(_transpose_kernel, false);
_original_b->mark_as_unused();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 27bed44098..82710b6461 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,8 +91,7 @@ void CLConvolutionLayerReshapeWeights::run()
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
- _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false),
- _retain_internal_weights(false)
+ _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
@@ -166,10 +165,9 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
dilation,
act_info));
- _is_prepared = false;
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _retain_internal_weights = weights_info.retain_internal_weights();
+ _is_prepared = weights_info.retain_internal_weights();
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
const DataType dt = input->info()->data_type();
@@ -408,23 +406,18 @@ void CLGEMMConvolutionLayer::prepare()
{
if(!_is_prepared)
{
- if(!_retain_internal_weights)
- {
- // Run weights reshaping and mark as unused
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
- _original_weights->mark_as_unused();
- }
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+ _original_weights->mark_as_unused();
- // Run GEMM prepare
- if(!_is_quantized)
+ // Prepare GEMM
+ _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
+ if(!_weights_reshaped.is_used())
{
- _mm_gemm.prepare();
- if(!_weights_reshaped.is_used() && !_retain_internal_weights)
- {
- _weights_reshaped.allocator()->free();
- }
+ _weights_reshaped.allocator()->free();
}
CLScheduler::get().queue().finish();
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 711b006ede..94dc0e071c 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -59,8 +59,23 @@ inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_o
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
- _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
+ : _memory_group(std::move(memory_manager)),
+ _mm_kernel(),
+ _mtx_a_reshape_kernel(),
+ _mtx_b_reshape_kernel(),
+ _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(),
+ _offset_contribution_kernel(),
+ _vector_sum_col(),
+ _vector_sum_row(),
+ _tmp_a(),
+ _tmp_b(),
+ _original_b(nullptr),
+ _a_offset(0),
+ _b_offset(0),
+ _is_interleaved_transposed(true),
+ _reshape_b_only_on_first_run(false),
+ _is_prepared(false)
{
}
@@ -70,6 +85,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
ARM_COMPUTE_UNUSED(gemm_info);
ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+ _is_prepared = false;
+ _original_b = b;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_a_offset = a->info()->quantization_info().offset;
_b_offset = b->info()->quantization_info().offset;
@@ -149,10 +166,13 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
if(_is_interleaved_transposed)
{
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
}
- if(_a_offset != 0)
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
_vector_sum_col.allocator()->allocate();
}
@@ -234,6 +254,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
void CLGEMMLowpMatrixMultiplyCore::run()
{
+ prepare();
+
_memory_group.acquire();
if(_is_interleaved_transposed)
@@ -241,21 +263,17 @@ void CLGEMMLowpMatrixMultiplyCore::run()
// Run reshape matrix A
CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
- if(_is_first_run || !_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run)
{
// Run reshape matrix B
CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
}
}
- // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once
- if(_is_first_run || !_reshape_b_only_on_first_run)
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
- {
- CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
- }
+ CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
}
// Run matrix multiply
@@ -271,6 +289,30 @@ void CLGEMMLowpMatrixMultiplyCore::run()
CLScheduler::get().enqueue(_offset_contribution_kernel, true);
_memory_group.release();
+}
+
+void CLGEMMLowpMatrixMultiplyCore::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ // Run reshape kernel and mark original weights tensor as unused
+ _tmp_b.allocator()->allocate();
+ CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+ _original_b->mark_as_unused();
+ }
- _is_first_run = false;
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ _vector_sum_col.allocator()->allocate();
+ CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+ }
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
}
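Beyond the API change, this hunk also moves _tmp_b and _vector_sum_col out of configure()-time allocation when reshape_b_only_on_first_run is set: prepare() allocates them and enqueues the B reshape and B reduction exactly once. A hedged caller-side sketch of the steady state, assuming the quantized configure() signature of this release with invented shapes and quantization parameters:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const QuantizationInfo qinfo(0.5f, 10); // invented scale/offset
    CLTensor a{}, b{}, dst{};
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8, qinfo));
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::QASYMM8, qinfo));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::S32));

    CLGEMMLowpMatrixMultiplyCore mm;
    mm.configure(&a, &b, &dst, GEMMInfo(false, false, true)); // reshape B only once

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill a and b ...

    for(int i = 0; i < 10; ++i)
    {
        mm.run(); // first iteration runs prepare(): B reshape + B reduction, once
    }
    CLScheduler::get().sync();
    return 0;
}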
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 31d5cd5a7e..d15e5dfa3d 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons
CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
- _is_first_run(false), _original_weights(nullptr)
+ _is_prepared(false), _original_weights(nullptr)
{
}
@@ -128,7 +128,7 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
bool _has_bias = (biases != nullptr);
_original_weights = weights;
- _is_first_run = true;
+ _is_prepared = false;
const unsigned int kernel_width = weights->info()->dimension(0);
const unsigned int kernel_height = weights->info()->dimension(1);
@@ -160,7 +160,6 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
// Allocate intermediate tensors
- _weights_reshaped.allocator()->allocate();
_input_im2col_reshaped.allocator()->allocate();
_gemm_output.allocator()->allocate();
@@ -169,17 +168,7 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
void CLLocallyConnectedLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _is_first_run = false;
- CLScheduler::get().enqueue(_weights_reshape_kernel);
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
_memory_group.acquire();
@@ -194,3 +183,19 @@ void CLLocallyConnectedLayer::run()
_memory_group.release();
}
+
+void CLLocallyConnectedLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ _original_weights->mark_as_unused();
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 4843ba6364..0e1b9d5b58 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -36,7 +36,8 @@ using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output()
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+ _is_prepared(false)
{
}
@@ -74,6 +75,8 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con
const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+ _is_prepared = false;
+
_fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
_gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
@@ -100,7 +103,10 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con
void CLRNNLayer::run()
{
+ prepare();
+
_memory_group.acquire();
+
_fully_connected_kernel.run();
_gemm_state_f.run();
CLScheduler::get().enqueue(_add_kernel);
@@ -108,5 +114,17 @@ void CLRNNLayer::run()
// copy hidden out to output
CLScheduler::get().enqueue(_copy_kernel);
+
_memory_group.release();
+}
+
+void CLRNNLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ _fully_connected_kernel.prepare();
+ _gemm_state_f.prepare();
+
+ _is_prepared = true;
+ }
}
\ No newline at end of file