From 72219330fd85b1271e714d4ba894d6d8e26340c9 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 5 Jun 2018 14:56:06 +0100 Subject: COMPMID-1145: (API) Introduce prepare() stage (NEON/CL/GLES) Change-Id: I5b46764f9c3154ec3e3b9c951cc9e6dfbcb81dfb Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134255 Reviewed-by: Anthony Barbier Tested-by: Jenkins Reviewed-by: Pablo Tello Reviewed-by: Michele DiGiorgio --- src/runtime/CL/functions/CLDeconvolutionLayer.cpp | 18 ++++- .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 35 +++++---- .../CLDepthwiseSeparableConvolutionLayer.cpp | 10 ++- src/runtime/CL/functions/CLGEMM.cpp | 6 +- .../CL/functions/CLGEMMConvolutionLayer.cpp | 35 ++++----- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 68 +++++++++++++---- .../CL/functions/CLLocallyConnectedLayer.cpp | 33 ++++---- src/runtime/CL/functions/CLRNNLayer.cpp | 20 ++++- .../GLES_COMPUTE/functions/GCConvolutionLayer.cpp | 42 ++++++----- .../functions/GCFullyConnectedLayer.cpp | 35 +++++---- src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp | 45 ++++++++--- src/runtime/NEON/functions/NEConvolutionLayer.cpp | 6 ++ .../NEON/functions/NEDeconvolutionLayer.cpp | 18 ++++- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 34 +++++---- .../NEDepthwiseSeparableConvolutionLayer.cpp | 10 ++- .../NEON/functions/NEFullyConnectedLayer.cpp | 54 ++++++------- src/runtime/NEON/functions/NEGEMM.cpp | 48 +++++++++--- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 65 ++++++++++------ .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 88 +++++++++++++++++----- .../NEON/functions/NELocallyConnectedLayer.cpp | 32 ++++---- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 40 +++++++--- 21 files changed, 503 insertions(+), 239 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 6c54b18b81..4c1ea5b9a2 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -38,7 +38,8 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr memor : _memory_group(std::move(memory_manager)), _scale_f(), _conv_f(), - _scaled_output() + _scaled_output(), + _is_prepared(false) { } @@ -104,6 +105,8 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top)); + _is_prepared = false; + _memory_group.manage(&_scaled_output); // configure scale function @@ -126,8 +129,21 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, void CLDeconvolutionLayer::run() { + prepare(); + _memory_group.acquire(); + _scale_f.run(); _conv_f.run(); + _memory_group.release(); } + +void CLDeconvolutionLayer::prepare() +{ + if(!_is_prepared) + { + _conv_f.prepare(); + _is_prepared = true; + } +} diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index c2b24e3c20..1815361a72 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -91,7 +91,7 @@ void CLDepthwiseConvolutionLayer3x3::run() CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer() : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), - _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr) + _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr) { } @@ -104,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w const size_t weights_h = weights->info()->dimension(1); const size_t weights_z = weights->info()->dimension(2); - _is_first_run = true; + _is_prepared = false; _original_weights = weights; _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); @@ -182,7 +182,6 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w // Allocate intermediate tensors _input_reshaped.allocator()->allocate(); - _weights_reshaped.allocator()->allocate(); _v2mm_output.allocator()->allocate(); } @@ -235,18 +234,7 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe void CLDepthwiseConvolutionLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - CLScheduler::get().enqueue(_weights_reshape_kernel); - CLScheduler::get().enqueue(_v2mm_weights_fill_border); - _is_first_run = false; - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); CLScheduler::get().enqueue(_im2col_kernel); CLScheduler::get().enqueue(_v2mm_input_fill_border); @@ -257,3 +245,20 @@ void CLDepthwiseConvolutionLayer::run() CLScheduler::get().enqueue(_output_stage_kernel); } } + +void CLDepthwiseConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark original weights tensor as unused + _weights_reshaped.allocator()->allocate(); + CLScheduler::get().enqueue(_weights_reshape_kernel); + CLScheduler::get().enqueue(_v2mm_weights_fill_border); + _original_weights->mark_as_unused(); + + CLScheduler::get().queue().finish(); + _is_prepared = true; + } +} diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp index af2c6f0eb8..fa2c3affa3 100644 --- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp +++ 
b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,14 @@ void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICL void CLDepthwiseSeparableConvolutionLayer::run() { + prepare(); + _depthwise_conv.run(); _pointwise_conv.run(); +} + +void CLDepthwiseSeparableConvolutionLayer::prepare() +{ + _depthwise_conv.prepare(); + _pointwise_conv.prepare(); } \ No newline at end of file diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index f9713bb586..bb76872700 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -84,12 +84,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info)); - // Store original b matrix - _original_b = b; - // Check if we need to reshape the matrix B only on the first run _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _is_prepared = false; + _original_b = b; const ICLTensor *matrix_a = a; const ICLTensor *matrix_b = b; @@ -262,7 +260,7 @@ void CLGEMM::prepare() { if(_is_interleaved_transposed && _reshape_b_only_on_first_run) { - // Run transpose kernel + // Run transpose kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); CLScheduler::get().enqueue(_transpose_kernel, false); _original_b->mark_as_unused(); diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 27bed44098..82710b6461 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -91,8 +91,7 @@ void CLConvolutionLayerReshapeWeights::run() CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager) : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(), - _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false), - _retain_internal_weights(false) + _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false) { } @@ -166,10 +165,9 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * dilation, act_info)); - _is_prepared = false; - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _retain_internal_weights = weights_info.retain_internal_weights(); + _is_prepared = weights_info.retain_internal_weights(); + _original_weights = weights; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const DataType dt = input->info()->data_type(); @@ -408,23 +406,18 @@ void CLGEMMConvolutionLayer::prepare() { if(!_is_prepared) { - if(!_retain_internal_weights) - { - // Run weights reshaping and mark as unused - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - _weights_reshaped.allocator()->allocate(); - _reshape_weights.run(); - _original_weights->mark_as_unused(); - } + 
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark original weights tensor as unused + _weights_reshaped.allocator()->allocate(); + _reshape_weights.run(); + _original_weights->mark_as_unused(); - // Run GEMM prepare - if(!_is_quantized) + // Prepare GEMM + _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare(); + if(!_weights_reshaped.is_used()) { - _mm_gemm.prepare(); - if(!_weights_reshaped.is_used() && !_retain_internal_weights) - { - _weights_reshaped.allocator()->free(); - } + _weights_reshaped.allocator()->free(); } CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 711b006ede..94dc0e071c 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -59,8 +59,23 @@ inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_o } // namespace CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), - _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false) + : _memory_group(std::move(memory_manager)), + _mm_kernel(), + _mtx_a_reshape_kernel(), + _mtx_b_reshape_kernel(), + _mtx_a_reduction_kernel(), + _mtx_b_reduction_kernel(), + _offset_contribution_kernel(), + _vector_sum_col(), + _vector_sum_row(), + _tmp_a(), + _tmp_b(), + _original_b(nullptr), + _a_offset(0), + _b_offset(0), + _is_interleaved_transposed(true), + _reshape_b_only_on_first_run(false), + _is_prepared(false) { } @@ -70,6 +85,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor ARM_COMPUTE_UNUSED(gemm_info); ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info)); + _is_prepared = false; + _original_b = b; _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _a_offset = a->info()->quantization_info().offset; _b_offset = b->info()->quantization_info().offset; @@ -149,10 +166,13 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor if(_is_interleaved_transposed) { _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } } - if(_a_offset != 0) + if(_a_offset != 0 && !_reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); } @@ -234,6 +254,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso void CLGEMMLowpMatrixMultiplyCore::run() { + prepare(); + _memory_group.acquire(); if(_is_interleaved_transposed) @@ -241,21 +263,17 @@ void CLGEMMLowpMatrixMultiplyCore::run() // Run reshape matrix A CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false); - if(_is_first_run || !_reshape_b_only_on_first_run) + if(!_reshape_b_only_on_first_run) { // Run reshape matrix B CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); } } - // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once - if(_is_first_run || !_reshape_b_only_on_first_run) + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + 
if(_a_offset != 0 && !_reshape_b_only_on_first_run) { - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); - } + CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); } // Run matrix multiply @@ -271,6 +289,30 @@ void CLGEMMLowpMatrixMultiplyCore::run() CLScheduler::get().enqueue(_offset_contribution_kernel, true); _memory_group.release(); +} + +void CLGEMMLowpMatrixMultiplyCore::prepare() +{ + if(!_is_prepared) + { + if(_is_interleaved_transposed && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + // Run reshape kernel and mark original weights tensor as unused + _tmp_b.allocator()->allocate(); + CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); + _original_b->mark_as_unused(); + } - _is_first_run = false; + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0 && _reshape_b_only_on_first_run) + { + _vector_sum_col.allocator()->allocate(); + CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); + } + + CLScheduler::get().queue().finish(); + _is_prepared = true; + } } diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp index 31d5cd5a7e..d15e5dfa3d 100644 --- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp @@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_first_run(false), _original_weights(nullptr) + _is_prepared(false), _original_weights(nullptr) { } @@ -128,7 +128,7 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor bool _has_bias = (biases != nullptr); _original_weights = weights; - _is_first_run = true; + _is_prepared = false; const unsigned int kernel_width = weights->info()->dimension(0); const unsigned int kernel_height = weights->info()->dimension(1); @@ -160,7 +160,6 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h)); // Allocate intermediate tensors - _weights_reshaped.allocator()->allocate(); _input_im2col_reshaped.allocator()->allocate(); _gemm_output.allocator()->allocate(); @@ -169,17 +168,7 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor void CLLocallyConnectedLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _is_first_run = false; - CLScheduler::get().enqueue(_weights_reshape_kernel); - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -194,3 +183,19 @@ void CLLocallyConnectedLayer::run() _memory_group.release(); } + +void CLLocallyConnectedLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark original weights tensor as unused + _weights_reshaped.allocator()->allocate(); + CLScheduler::get().enqueue(_weights_reshape_kernel); + _original_weights->mark_as_unused(); + + 
CLScheduler::get().queue().finish(); + _is_prepared = true; + } +} diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 4843ba6364..0e1b9d5b58 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -36,7 +36,8 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output() + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), + _is_prepared(false) { } @@ -74,6 +75,8 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + _is_prepared = false; + _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); @@ -100,7 +103,10 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con void CLRNNLayer::run() { + prepare(); + _memory_group.acquire(); + _fully_connected_kernel.run(); _gemm_state_f.run(); CLScheduler::get().enqueue(_add_kernel); @@ -108,5 +114,17 @@ void CLRNNLayer::run() // copy hidden out to output CLScheduler::get().enqueue(_copy_kernel); + _memory_group.release(); +} + +void CLRNNLayer::prepare() +{ + if(!_is_prepared) + { + _fully_connected_kernel.prepare(); + _gemm_state_f.prepare(); + + _is_prepared = true; + } } \ No newline at end of file diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp index d1ef87d32c..67b2ae9d61 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp +++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp @@ -37,7 +37,7 @@ using namespace arm_compute; GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights() - : _weights_reshape_kernel(), _weights_reshaped() + : _weights_reshape_kernel() { } @@ -68,7 +68,7 @@ void GCConvolutionLayerReshapeWeights::run() GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr), - _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_first_run(true), _is_activationlayer_enabled(false) + _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_activationlayer_enabled(false), _is_prepared(false) { } @@ -97,7 +97,7 @@ void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weig ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2)); ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); - _is_first_run = true; + _is_prepared = false; _original_weights = weights; if(biases != nullptr) @@ -184,9 +184,6 @@ 
void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weig ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one"); - // Allocate intermediate tensor - _weights_reshaped.allocator()->allocate(); - //Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); @@ -200,17 +197,7 @@ void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weig void GCConvolutionLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _reshape_weights.run(); - _is_first_run = false; - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -221,17 +208,34 @@ void GCConvolutionLayer::run() // Run gemm on reshaped matrices _mm_gemm.run(); - GCScheduler::get().memory_barrier(); + // Reshape output matrix GCScheduler::get().dispatch(_output_col2im_kernel, false); + GCScheduler::get().memory_barrier(); _memory_group.release(); - GCScheduler::get().memory_barrier(); // Run Activation Layer if(_is_activationlayer_enabled) { _activationlayer_function.run(); } } + +void GCConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark as unused + _weights_reshaped.allocator()->allocate(); + _reshape_weights.run(); + + // Mark original weights tensor as unused + _original_weights->mark_as_unused(); + + _is_prepared = true; + } +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp index a300033bb2..ab2c6c2813 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp +++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp @@ -40,7 +40,7 @@ void GCFullyConnectedLayerReshapeWeights::configure(const IGCTensor *input, IGCT GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(), - _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false) + _original_weights(nullptr), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false) { } @@ -86,6 +86,7 @@ void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *w ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2); + _original_weights = weights; _are_weights_reshaped = transpose_weights ? 
are_weights_reshaped : true; _is_fc_after_conv = true; _accumulate_biases = false; @@ -141,25 +142,13 @@ void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *w configure_fc_fc(input, weights_to_use, output); } - // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called - if(!_are_weights_reshaped && !retain_internal_weights) - { - // Allocate the tensor for the weights reshaped - _reshape_weights_output.allocator()->allocate(); - } - ARM_COMPUTE_ERROR_ON(retain_internal_weights && _reshape_weights_output.gc_buffer() == 0); _are_weights_reshaped = _are_weights_reshaped || retain_internal_weights; } void GCFullyConnectedLayer::run() { - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) - { - _are_weights_reshaped = true; - _reshape_weights_kernel.run(); - } + prepare(); _memory_group.acquire(); @@ -187,3 +176,21 @@ void GCFullyConnectedLayer::run() _memory_group.release(); } + +void GCFullyConnectedLayer::prepare() +{ + // Reshape of the weights (happens only once) + if(!_are_weights_reshaped) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run reshape weights kernel and mark weights as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_kernel.run(); + + // Mark original weights tensor as unused + _original_weights->mark_as_unused(); + + _are_weights_reshaped = true; + } +} \ No newline at end of file diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp index 79f8f71713..8ae91ee82c 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp +++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp @@ -73,8 +73,8 @@ Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const IGCT } // namespace GCGEMM::GCGEMM(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false), - _is_first_run(true), _reshape_b_only_on_first_run(false) + : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false), + _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false) { } @@ -87,6 +87,8 @@ void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor * // Check if we need to reshape the matrix B only on the first run _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _is_prepared = false; + _original_b = b; const IGCTensor *matrix_a = a; const IGCTensor *matrix_b = b; @@ -136,7 +138,10 @@ void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor * { // Allocate intermediate tensors _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } } // Configure matrix addition kernel @@ -155,23 +160,21 @@ Status GCGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const IGCTen void GCGEMM::run() { + prepare(); + _memory_group.acquire(); + if(_is_interleaved_transposed) { // Run interleave kernel GCScheduler::get().dispatch(_interleave_kernel, false); - if(_is_first_run) - { - // Run transpose kernel - GCScheduler::get().dispatch(_transpose_kernel, false); - _is_first_run = false; - } - else if(!_reshape_b_only_on_first_run) + 
if(!_reshape_b_only_on_first_run) { // Run transpose kernel GCScheduler::get().dispatch(_transpose_kernel, false); } + GCScheduler::get().memory_barrier(); } @@ -184,5 +187,27 @@ void GCGEMM::run() GCScheduler::get().memory_barrier(); GCScheduler::get().dispatch(_ma_kernel); } + _memory_group.release(); } + +void GCGEMM::prepare() +{ + if(!_is_prepared) + { + if(_is_interleaved_transposed && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + // Run transpose kernel + _tmp_b.allocator()->allocate(); + GCScheduler::get().dispatch(_transpose_kernel, false); + GCScheduler::get().memory_barrier(); + + // Mark original weights tensor as unused + _original_b->mark_as_unused(); + } + + _is_prepared = true; + } +} diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 96ac95f00c..4018407153 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -155,6 +155,12 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo * void NEConvolutionLayer::run() { + prepare(); _function->run(); } + +void NEConvolutionLayer::prepare() +{ + _function->prepare(); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index 40ada8f6cf..8051d6da0e 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -38,7 +38,8 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr memor _scaled_output(), _input(nullptr), _info(), - _inner_border() + _inner_border(), + _is_prepared(false) { } @@ -104,6 +105,7 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con _input = input; _info = info; _inner_border = std::make_pair(inner_border_right, inner_border_top); + _is_prepared = false; const unsigned int stride_x = info.stride().first; const unsigned int stride_y = info.stride().second; @@ -132,13 +134,21 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con void NEDeconvolutionLayer::run() { + prepare(); + _memory_group.acquire(); - // Run upsample kernel _upsample_f.run(); - - // Run convolution layer _conv_f.run(); _memory_group.release(); +} + +void NEDeconvolutionLayer::prepare() +{ + if(!_is_prepared) + { + _conv_f.prepare(); + _is_prepared = true; + } } \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 0a977ad08d..83c3e217f3 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -172,7 +172,7 @@ void NEDepthwiseConvolutionLayer3x3::run() NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer() : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), - _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr) + _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr) { } @@ -187,7 +187,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh const size_t weights_z = weights->info()->dimension(2); _is_quantized = 
is_data_type_quantized_asymmetric(input->info()->data_type()); - _is_first_run = true; + _is_prepared = false; _original_weights = weights; // Should bias be appended ? @@ -260,24 +260,12 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh // Allocate intermediate tensors _input_reshaped.allocator()->allocate(); - _weights_reshaped.allocator()->allocate(); _v2mm_output.allocator()->allocate(); } void NEDepthwiseConvolutionLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX); - NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX); - _is_first_run = false; - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); NEScheduler::get().schedule(&_im2col_kernel, Window::DimX); NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX); @@ -288,3 +276,19 @@ void NEDepthwiseConvolutionLayer::run() NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX); } } + +void NEDepthwiseConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run reshape and mark original weights as unused + _weights_reshaped.allocator()->allocate(); + NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX); + NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX); + _original_weights->mark_as_unused(); + + _is_prepared = true; + } +} diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp index d70a6689ac..da2e49c730 100644 --- a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,6 +45,14 @@ void NEDepthwiseSeparableConvolutionLayer::configure(ITensor *input, const ITens void NEDepthwiseSeparableConvolutionLayer::run() { + prepare(); + _depthwise_conv.run(); _pointwise_conv.run(); +} + +void NEDepthwiseSeparableConvolutionLayer::prepare() +{ + _depthwise_conv.prepare(); + _pointwise_conv.prepare(); } \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 958d081fd2..5b9f182bcb 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -131,8 +131,8 @@ void NEFullyConnectedLayerReshapeWeights::run() } NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), - _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr) + : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_function(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), + _interleave4x4_output(), _reshape_weights_output(), _original_weights(nullptr), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _is_prepared(false) { } @@ -163,16 +163,16 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions; const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions); - _original_weights = weights; - _linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1); - _are_weights_reshaped = are_weights_reshaped; - _accumulate_biases = biases != nullptr; - _is_batched_fc_layer = num_batch_dimensions > 0; + _original_weights = weights; + _linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1); + _accumulate_biases = biases != nullptr; + _is_batched_fc_layer = num_batch_dimensions > 0; + _is_prepared = are_weights_reshaped || (!transpose_weights && !_is_batched_fc_layer); const size_t interleave_width = 16 / input->info()->element_size(); const ITensor *weights_to_use = weights; - if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer)) + if(!_is_prepared) { weights_to_use = &_reshape_weights_output; @@ -181,7 +181,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh _is_batched_fc_layer, interleave_width))); // Reshape the weights - _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer); + _reshape_weights_function.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer); } const ITensor *multiply_input = input; @@ -220,13 +220,6 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh _accumulate_biases_kernel.configure(output, biases); } - // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called - if(!are_weights_reshaped && (transpose_weights || 
_is_batched_fc_layer)) - { - // Allocate the tensor for the weights reshaped - _reshape_weights_output.allocator()->allocate(); - } - if(_linearize_input) { _im2col_output.allocator()->allocate(); @@ -322,17 +315,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn void NEFullyConnectedLayer::run() { - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _are_weights_reshaped = true; - _reshape_weights_kernel.run(); - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -359,3 +342,20 @@ void NEFullyConnectedLayer::run() _memory_group.release(); } + +void NEFullyConnectedLayer::prepare() +{ + // Reshape of the weights (happens only once) + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshape, clean internal tensors and mark original weights tensor as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + _reshape_weights_function = NEFullyConnectedLayerReshapeWeights(); + _original_weights->mark_as_unused(); + + _is_prepared = true; + } +} diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 9168ed4327..a98309d304 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -40,7 +40,7 @@ namespace arm_compute { NEGEMM::NEGEMM(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(), - _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false) + _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false) { } @@ -63,8 +63,11 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe } // Check if we need to reshape the matrix B only on the first run + _is_prepared = false; _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; + _original_b = b; + _asm_glue._optimised_kernel = nullptr; const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f) && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue); @@ -128,7 +131,10 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe // Allocate once the all configure methods have been called _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } // Configure matrix addition kernel if(beta != 0 && c != nullptr) @@ -142,28 +148,24 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe void NEGEMM::run() { - _memory_group.acquire(); + prepare(); if(_asm_glue._optimised_kernel != nullptr) { + _memory_group.acquire(); _asm_glue.run(); _memory_group.release(); } else { + _memory_group.acquire(); + if(!_run_vector_matrix_multiplication) { // Run interleave kernel NEScheduler::get().schedule(&_interleave_kernel, Window::DimY); - if(_is_first_run) - { - // Run transpose kernel - NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); - 
- _is_first_run = false; - } - else if(!_reshape_b_only_on_first_run) + if(!_reshape_b_only_on_first_run) { // Run transpose kernel NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); @@ -181,4 +183,28 @@ void NEGEMM::run() } } } + +void NEGEMM::prepare() +{ + if(!_is_prepared) + { + if(_asm_glue._optimised_kernel) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + _asm_glue.prepare(); + _original_b->mark_as_unused(); + } + else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue._optimised_kernel) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + _tmp_b.allocator()->allocate(); + NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + _original_b->mark_as_unused(); + } + + _is_prepared = true; + } +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index 303691aa7a..d4400b8864 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -231,7 +231,7 @@ NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptrinfo()->fixed_point_position(); const ITensor *biases_to_use = (_append_bias) ? biases : nullptr; @@ -442,12 +443,6 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - // Allocate intermediate tensor - if(!_are_weights_reshaped) - { - _weights_reshaped.allocator()->allocate(); - } - //Configure Activation Layer if(_is_activationlayer_enabled) { @@ -585,17 +580,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI void NEGEMMConvolutionLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(!_are_weights_reshaped) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _are_weights_reshaped = true; - _reshape_weights.run(); - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -610,11 +595,6 @@ void NEGEMMConvolutionLayer::run() if(_asm_glue._optimised_kernel != nullptr) { _asm_glue.run(); - // Release weights in case buffer is pretransposed - if(!_weights_reshaped.is_used()) - { - _weights_reshaped.allocator()->free(); - } } else { @@ -659,4 +639,43 @@ void NEGEMMConvolutionLayer::run() _memory_group.release(); } + +void NEGEMMConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + // Run weights reshaping (Runs once for every configure) + if(!_are_weights_reshaped) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + _weights_reshaped.allocator()->allocate(); + _reshape_weights.run(); + _reshape_weights = NEConvolutionLayerReshapeWeights(); + _original_weights->mark_as_unused(); + _are_weights_reshaped = true; + } + + // Run GEMM prepare stage + if(_asm_glue._optimised_kernel) + { + _asm_glue.prepare(); + } + else + { + if(_is_quantized) + { + _mm_gemmlowp.prepare(); + } + } + + // Release weights in case buffer is pretransposed + if(!_weights_reshaped.is_used()) + { + _weights_reshaped.allocator()->free(); + } + + _is_prepared = true; + } +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 2e06fa2ef4..a92ffa7c7b 100644 --- 
a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -42,8 +42,8 @@ using namespace arm_compute::misc::shape_calculator; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0), - _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false) + _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _original_b(nullptr), _a_offset(0), _b_offset(0), + _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false) { } @@ -52,23 +52,32 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info)); + // Clear state + _mtx_a_reshape_kernel = nullptr; + _mtx_b_reshape_kernel = nullptr; + _asm_glue_signed._optimised_kernel = nullptr; + _asm_glue_unsigned._optimised_kernel = nullptr; + + // Set internal variables _a_offset = a->info()->quantization_info().offset; _b_offset = b->info()->quantization_info().offset; _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _is_prepared = false; + _original_b = b; #ifdef __aarch64__ switch(a->info()->data_type()) { case DataType::S8: { - _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed); + _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed); break; } case DataType::QASYMM8: case DataType::U8: { - _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned); + _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned); break; } default: @@ -160,10 +169,13 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(!_dot_product_path && !_run_vector_matrix_multiplication) { _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } } - if(_a_offset != 0) + if(_a_offset != 0 && !_reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); } @@ -248,22 +260,21 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso void NEGEMMLowpMatrixMultiplyCore::run() { + prepare(); + _memory_group.acquire(); - // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction - if(!_run_vector_matrix_multiplication && !_dot_product_path) + // Reshape inputs + if(_mtx_a_reshape_kernel) { - if(_mtx_a_reshape_kernel) - { - 
NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } - - if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run)) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } + NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); + } + if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run) + { + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); } + // Run GEMM if(_asm_glue_unsigned._optimised_kernel != nullptr) { _asm_glue_unsigned.run(); @@ -284,7 +295,7 @@ void NEGEMMLowpMatrixMultiplyCore::run() } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run)) + if(_a_offset != 0 && !_reshape_b_only_on_first_run) { NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); } @@ -293,6 +304,45 @@ void NEGEMMLowpMatrixMultiplyCore::run() NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); _memory_group.release(); +} - _is_first_run = false; +void NEGEMMLowpMatrixMultiplyCore::prepare() +{ + if(!_is_prepared) + { + // Run assembly reshape + if((_asm_glue_signed._optimised_kernel || _asm_glue_signed._optimised_kernel) && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + if(_asm_glue_unsigned._optimised_kernel != nullptr) + { + _asm_glue_unsigned.prepare(); + } + else if(_asm_glue_signed._optimised_kernel != nullptr) + { + _asm_glue_signed.prepare(); + } + _original_b->mark_as_unused(); + } + // Run non-assembly reshape + else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + // Run reshape kernel and mark original weights tensor as unused + _tmp_b.allocator()->allocate(); + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + _original_b->mark_as_unused(); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0 && _reshape_b_only_on_first_run) + { + _vector_sum_col.allocator()->allocate(); + NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + } + + _is_prepared = true; + } } diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp index 913acf86a2..0737bd2f73 100644 --- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp @@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_first_run(false), _original_weights(nullptr) + _is_prepared(false), _original_weights(nullptr) { } @@ -127,7 +127,7 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? 
nullptr : biases->info(), output->info(), conv_info)); bool _has_bias = (biases != nullptr); - _is_first_run = true; + _is_prepared = false; _original_weights = weights; const unsigned int kernel_width = weights->info()->dimension(0); @@ -160,24 +160,13 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h)); // Allocate intermediate tensors - _weights_reshaped.allocator()->allocate(); _input_im2col_reshaped.allocator()->allocate(); _gemm_output.allocator()->allocate(); } void NELocallyConnectedLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _is_first_run = false; - NEScheduler::get().schedule(&_weights_reshape_kernel, 3); - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -192,3 +181,18 @@ void NELocallyConnectedLayer::run() _memory_group.release(); } + +void NELocallyConnectedLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark original weights tensor as unused + _weights_reshaped.allocator()->allocate(); + NEScheduler::get().schedule(&_weights_reshape_kernel, 3); + _original_weights->mark_as_unused(); + + _is_prepared = true; + } +} diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index d6bc5cfd9a..d9f6c0e0f8 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -113,7 +113,7 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), - _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false) + _workspace(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false) { } /* arm_compute */ @@ -139,9 +139,10 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); } - _weights = weights; - _input = input; - _output = output; + _weights = weights; + _input = input; + _output = output; + _is_prepared = false; std::unique_ptr> transform_input_kernel; std::unique_ptr> transform_weights_kernel; @@ -198,10 +199,13 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const Tensor4DShape in_shape(internal_get_input_shape(input)); const size_t data_type_size = input->info()->element_size(); // Get the memory required to instantiate a new Winograd operator. 
- constexpr size_t storage_alignment = 64; - const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size; + constexpr size_t storage_alignment = 64; + + // Kernel Storage + const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size; _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8)); _kernel_storage.allocator()->allocate(); + // Input storage const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size; _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8)); @@ -331,13 +335,9 @@ void NEWinogradConvolutionLayer::run() { const DataLayout data_layout = _input->info()->data_layout(); + prepare(); + _memory_group.acquire(); - if(!_reshaped_kernel) - { - _reshaped_kernel = true; - _permute_weights.run(); - NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX); - } if(data_layout == DataLayout::NCHW) { @@ -491,4 +491,20 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen return Status{}; } +void NEWinogradConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + // Permute weights + _permute_weights.run(); + _weights->mark_as_unused(); + + // Transform weights + NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX); + _weights_hwio.allocator()->free(); + + _is_prepared = true; + } +} + } // namespace arm_compute -- cgit v1.2.1
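Usage note (addendum, not part of the patch): the prepare() hook introduced here factors the one-time work that run() previously guarded with _is_first_run flags (reshaping the weights or the B matrix, then marking the original tensor as unused so its memory can be reclaimed) into an idempotent method guarded by _is_prepared. run() calls prepare() itself, which is a no-op after the first call, so existing callers keep working; callers that want to front-load the reshaping, for example a graph runtime warming up before the first inference, can now call prepare() explicitly. The sketch below shows that lifecycle on NEGEMM. It is an illustration only: the tensor shapes, the three-argument GEMMInfo(false, false, true) constructor (reshape_b_only_on_first_run = true), the include paths and the main() harness are assumptions based on the API roughly as it stood at this commit, not something this patch defines.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Illustrative shapes: A is 32x64 (M x K), B is 64x16 (K x N), D is 32x16 (M x N).
        Tensor a, b, d;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
        d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));

        // reshape_b_only_on_first_run = true lets prepare() pre-transpose B once,
        // so repeated run() calls skip that work.
        NEGEMM gemm;
        gemm.configure(&a, &b, nullptr, &d, 1.f, 0.f, GEMMInfo(false, false, true));

        a.allocator()->allocate();
        b.allocator()->allocate();
        d.allocator()->allocate();
        // ... fill a and b; B must hold its final values before prepare() consumes it ...

        gemm.prepare(); // one-time work: reshape/transpose the constant input, mark the original as unused

        for(int i = 0; i < 10; ++i)
        {
            gemm.run(); // run() also calls prepare() internally, so the explicit call above is optional
        }
        return 0;
    }

The same contract applies to the CL and GLES functions touched by this patch: composite functions such as CLDepthwiseSeparableConvolutionLayer simply forward prepare() to their members, and several of the CL implementations finish the command queue before setting _is_prepared so the reshaped buffers are valid once prepare() returns.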