From e043767d068da389308507011d944e6db9e4d676 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 2 May 2018 14:07:55 +0100
Subject: COMPMID-920: Introduce prepare() stage

Change-Id: I08ddb7f6e061178e7566518b48e4e18f8f078596
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129825
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 src/runtime/CL/functions/CLConvolutionLayer.cpp    |  6 +++
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 46 +++++++++++--------
 src/runtime/CL/functions/CLGEMM.cpp                | 39 +++++++++-------
 .../CL/functions/CLGEMMConvolutionLayer.cpp        | 52 ++++++++++++----------
 .../CL/functions/CLWinogradConvolutionLayer.cpp    | 40 ++++++++++-------
 5 files changed, 111 insertions(+), 72 deletions(-)

(limited to 'src/runtime')
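This change moves all one-shot work out of run() and into an idempotent
prepare() stage: weights reshaping in CLFullyConnectedLayer and
CLGEMMConvolutionLayer, the matrix B transpose in CLGEMM (when
reshape_b_only_on_first_run is set), and the filter transform in
CLWinogradConvolutionLayer; CLConvolutionLayer simply forwards prepare()
to whichever function it selected. run() still calls prepare() on entry,
so existing callers keep working, while callers that want the one-off
cost paid up front can invoke prepare() explicitly. Once the transformed
weights exist, the original weight tensors are marked unused so their
backing memory can be reclaimed.

A minimal caller-side sketch of the intended lifecycle follows. The
tensor setup and the helper name are illustrative, not part of this
change; the tensors are assumed to be shaped and allocated elsewhere.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

    using namespace arm_compute;

    // Hypothetical helper: tensors are assumed to be initialised and
    // allocated before this function is called.
    void run_inference(CLTensor &input, CLTensor &weights, CLTensor &biases, CLTensor &output)
    {
        CLScheduler::get().default_init();

        CLFullyConnectedLayer fc;
        fc.configure(&input, &weights, &biases, &output);

        // One-off stage introduced by this patch: runs the weights reshape
        // and marks the original weights as unused so their memory can be
        // reclaimed. Idempotent: run() also calls it, so calling it here
        // and again inside run() is safe.
        fc.prepare();

        for(int i = 0; i < 100; ++i)
        {
            fc.run(); // steady-state work only; prepare() is now a no-op
        }
    }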
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 83281e1747..3d4fb113b2 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -135,5 +135,11 @@ ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *
 
 void CLConvolutionLayer::run()
 {
+    prepare();
     _function->run();
 }
+
+void CLConvolutionLayer::prepare()
+{
+    _function->prepare();
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 9b3bf48bca..151fa1b5fa 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -220,13 +220,6 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
     }
-
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped)
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
 }
 
 Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
@@ -311,17 +304,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
 
 void CLFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -356,3 +339,30 @@ void CLFullyConnectedLayer::run()
 
     _memory_group.release();
 }
+
+void CLFullyConnectedLayer::prepare()
+{
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run reshape weights kernel and mark weights as unused
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_kernel.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_reshape_weights_output.is_used())
+            {
+                _reshape_weights_output.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _are_weights_reshaped = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 37fa0c5ba2..e735adba39 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -98,7 +98,7 @@ Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLT
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
-      _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -114,6 +114,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
 
     // Check if we need to reshape the matrix B only on the first run
    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = false;
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -169,7 +170,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
     // Configure matrix addition kernel
@@ -188,6 +192,8 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ICLTen
 
 void CLGEMM::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     if(_is_interleaved_transposed)
@@ -195,18 +201,7 @@
     {
         // Run interleave kernel
        CLScheduler::get().enqueue(_interleave_kernel, false);
-        if(_is_first_run)
-        {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
-
-            // Mark original b matrix as unused
-            if(_reshape_b_only_on_first_run)
-            {
-                _original_b->mark_as_unused();
-            }
-        }
-        else if(!_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
             CLScheduler::get().enqueue(_transpose_kernel, false);
@@ -223,6 +218,20 @@
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            // Run transpose kernel
+            _tmp_b.allocator()->allocate();
+            CLScheduler::get().enqueue(_transpose_kernel, false);
+            _original_b->mark_as_unused();
+        }
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index cf8a6a8a78..610eec4d67 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,7 +91,7 @@ void CLConvolutionLayerReshapeWeights::run()
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -165,7 +165,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
                                                     dilation,
                                                     act_info));
 
-    _is_first_run = true;
+    _is_prepared      = false;
     _original_weights = weights;
 
     _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
@@ -258,9 +258,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    _weights_reshaped.allocator()->allocate();
-
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
 
@@ -305,7 +302,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
     unsigned int mat_weights_cols = weights->dimension(3);
     unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized? nullptr:biases, nullptr));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
 
     // Create tensor info for im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
@@ -369,16 +366,7 @@
 
 void CLGEMMConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _reshape_weights.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -398,13 +386,6 @@
     {
         // Run gemm
         _mm_gemm.run();
-
-        // Release reshaped weights if marked unused by CLGEMM
-        if(_is_first_run && !_weights_reshaped.is_used())
-        {
-            CLScheduler::get().queue().finish();
-            _weights_reshaped.allocator()->free();
-        }
     }
 
     // Reshape output matrix
@@ -417,6 +398,29 @@
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMMConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run weights reshaping and mark as unused
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Run GEMM prepare
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_weights_reshaped.is_used())
+            {
+                _weights_reshaped.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 5ff4fbceee..025a16b4fb 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -69,7 +69,7 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
 
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
-      _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
 {
 }
 
@@ -97,6 +97,9 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
                                                 conv_info,
                                                 input->info()->data_layout());
 
+    _is_prepared      = false;
+    _original_weights = weights;
+
     // Manage intermediate tensors
     _memory_group.manage(&_input0);
     _memory_group.manage(&_batched_mm_output);
@@ -124,7 +127,6 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
 
     // Allocate temporary tensors
     _input0.allocator()->allocate();
-    _input1.allocator()->allocate();
     _batched_mm_output.allocator()->allocate();
 }
 
@@ -182,11 +184,7 @@ Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
 
 void CLWinogradConvolutionLayer::run()
 {
-    if(_is_first_run)
-    {
-        // Run filter transform
-        CLScheduler::get().enqueue(_filter_transform, false);
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -196,13 +194,6 @@
     // Run batched matrix multiplication
     _batched_mm.run();
 
-    // Release reshaped weights if marked unused by CLGEMM
-    if(_is_first_run && !_input1.is_used())
-    {
-        CLScheduler::get().queue().finish();
-        _input1.allocator()->free();
-    }
-
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
 
@@ -212,6 +203,25 @@
     }
 
     _memory_group.release();
+}
+
+void CLWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run filter transform and mark original weights as unused
+        _input1.allocator()->allocate();
+        CLScheduler::get().enqueue(_filter_transform, false);
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+        _batched_mm.prepare();
+        if(!_input1.is_used())
+        {
+            _input1.allocator()->free();
+        }
 
-    _is_first_run = false;
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
-- 
cgit v1.2.1