From 72219330fd85b1271e714d4ba894d6d8e26340c9 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 5 Jun 2018 14:56:06 +0100 Subject: COMPMID-1145: (API) Introduce prepare() stage (NEON/CL/GLES) Change-Id: I5b46764f9c3154ec3e3b9c951cc9e6dfbcb81dfb Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134255 Reviewed-by: Anthony Barbier Tested-by: Jenkins Reviewed-by: Pablo Tello Reviewed-by: Michele DiGiorgio --- src/runtime/NEON/functions/NEConvolutionLayer.cpp | 6 ++ .../NEON/functions/NEDeconvolutionLayer.cpp | 18 ++++- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 34 +++++---- .../NEDepthwiseSeparableConvolutionLayer.cpp | 10 ++- .../NEON/functions/NEFullyConnectedLayer.cpp | 54 ++++++------- src/runtime/NEON/functions/NEGEMM.cpp | 48 +++++++++--- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 65 ++++++++++------ .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 88 +++++++++++++++++----- .../NEON/functions/NELocallyConnectedLayer.cpp | 32 ++++---- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 40 +++++++--- 10 files changed, 269 insertions(+), 126 deletions(-) (limited to 'src/runtime/NEON') diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 96ac95f00c..4018407153 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -155,6 +155,12 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo * void NEConvolutionLayer::run() { + prepare(); _function->run(); } + +void NEConvolutionLayer::prepare() +{ + _function->prepare(); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index 40ada8f6cf..8051d6da0e 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -38,7 +38,8 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr memor _scaled_output(), _input(nullptr), _info(), - _inner_border() + _inner_border(), + _is_prepared(false) { } @@ -104,6 +105,7 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con _input = input; _info = info; _inner_border = std::make_pair(inner_border_right, inner_border_top); + _is_prepared = false; const unsigned int stride_x = info.stride().first; const unsigned int stride_y = info.stride().second; @@ -132,13 +134,21 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con void NEDeconvolutionLayer::run() { + prepare(); + _memory_group.acquire(); - // Run upsample kernel _upsample_f.run(); - - // Run convolution layer _conv_f.run(); _memory_group.release(); +} + +void NEDeconvolutionLayer::prepare() +{ + if(!_is_prepared) + { + _conv_f.prepare(); + _is_prepared = true; + } } \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 0a977ad08d..83c3e217f3 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -172,7 +172,7 @@ void NEDepthwiseConvolutionLayer3x3::run() NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer() : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), - _weights_reshaped(), _v2mm_output(), _output_reshaped(), 
_is_first_run(true), _is_quantized(false), _original_weights(nullptr) + _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr) { } @@ -187,7 +187,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh const size_t weights_z = weights->info()->dimension(2); _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _is_first_run = true; + _is_prepared = false; _original_weights = weights; // Should bias be appended ? @@ -260,24 +260,12 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh // Allocate intermediate tensors _input_reshaped.allocator()->allocate(); - _weights_reshaped.allocator()->allocate(); _v2mm_output.allocator()->allocate(); } void NEDepthwiseConvolutionLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX); - NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX); - _is_first_run = false; - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); NEScheduler::get().schedule(&_im2col_kernel, Window::DimX); NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX); @@ -288,3 +276,19 @@ void NEDepthwiseConvolutionLayer::run() NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX); } } + +void NEDepthwiseConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run reshape and mark original weights as unused + _weights_reshaped.allocator()->allocate(); + NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX); + NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX); + _original_weights->mark_as_unused(); + + _is_prepared = true; + } +} diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp index d70a6689ac..da2e49c730 100644 --- a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,6 +45,14 @@ void NEDepthwiseSeparableConvolutionLayer::configure(ITensor *input, const ITens void NEDepthwiseSeparableConvolutionLayer::run() { + prepare(); + _depthwise_conv.run(); _pointwise_conv.run(); +} + +void NEDepthwiseSeparableConvolutionLayer::prepare() +{ + _depthwise_conv.prepare(); + _pointwise_conv.prepare(); } \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 958d081fd2..5b9f182bcb 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -131,8 +131,8 @@ void NEFullyConnectedLayerReshapeWeights::run() } NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), - _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr) + : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_function(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), + _interleave4x4_output(), _reshape_weights_output(), _original_weights(nullptr), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _is_prepared(false) { } @@ -163,16 +163,16 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions; const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions); - _original_weights = weights; - _linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1); - _are_weights_reshaped = are_weights_reshaped; - _accumulate_biases = biases != nullptr; - _is_batched_fc_layer = num_batch_dimensions > 0; + _original_weights = weights; + _linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1); + _accumulate_biases = biases != nullptr; + _is_batched_fc_layer = num_batch_dimensions > 0; + _is_prepared = are_weights_reshaped || (!transpose_weights && !_is_batched_fc_layer); const size_t interleave_width = 16 / input->info()->element_size(); const ITensor *weights_to_use = weights; - if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer)) + if(!_is_prepared) { weights_to_use = &_reshape_weights_output; @@ -181,7 +181,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh _is_batched_fc_layer, interleave_width))); // Reshape the weights - _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer); + _reshape_weights_function.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer); } const ITensor *multiply_input = input; @@ -220,13 +220,6 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh _accumulate_biases_kernel.configure(output, biases); } - // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called - if(!are_weights_reshaped && (transpose_weights || 
_is_batched_fc_layer)) - { - // Allocate the tensor for the weights reshaped - _reshape_weights_output.allocator()->allocate(); - } - if(_linearize_input) { _im2col_output.allocator()->allocate(); @@ -322,17 +315,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn void NEFullyConnectedLayer::run() { - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _are_weights_reshaped = true; - _reshape_weights_kernel.run(); - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -359,3 +342,20 @@ void NEFullyConnectedLayer::run() _memory_group.release(); } + +void NEFullyConnectedLayer::prepare() +{ + // Reshape of the weights (happens only once) + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshape, clean internal tensors and mark original weights tensor as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + _reshape_weights_function = NEFullyConnectedLayerReshapeWeights(); + _original_weights->mark_as_unused(); + + _is_prepared = true; + } +} diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 9168ed4327..a98309d304 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -40,7 +40,7 @@ namespace arm_compute { NEGEMM::NEGEMM(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(), - _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false) + _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false) { } @@ -63,8 +63,11 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe } // Check if we need to reshape the matrix B only on the first run + _is_prepared = false; _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; + _original_b = b; + _asm_glue._optimised_kernel = nullptr; const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f) && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue); @@ -128,7 +131,10 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe // Allocate once the all configure methods have been called _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } // Configure matrix addition kernel if(beta != 0 && c != nullptr) @@ -142,28 +148,24 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe void NEGEMM::run() { - _memory_group.acquire(); + prepare(); if(_asm_glue._optimised_kernel != nullptr) { + _memory_group.acquire(); _asm_glue.run(); _memory_group.release(); } else { + _memory_group.acquire(); + if(!_run_vector_matrix_multiplication) { // Run interleave kernel NEScheduler::get().schedule(&_interleave_kernel, Window::DimY); - if(_is_first_run) - { - // Run transpose kernel - NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); - 
- _is_first_run = false; - } - else if(!_reshape_b_only_on_first_run) + if(!_reshape_b_only_on_first_run) { // Run transpose kernel NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); @@ -181,4 +183,28 @@ void NEGEMM::run() } } } + +void NEGEMM::prepare() +{ + if(!_is_prepared) + { + if(_asm_glue._optimised_kernel) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + _asm_glue.prepare(); + _original_b->mark_as_unused(); + } + else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue._optimised_kernel) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + _tmp_b.allocator()->allocate(); + NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + _original_b->mark_as_unused(); + } + + _is_prepared = true; + } +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index 303691aa7a..d4400b8864 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -231,7 +231,7 @@ NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptrinfo()->fixed_point_position(); const ITensor *biases_to_use = (_append_bias) ? biases : nullptr; @@ -442,12 +443,6 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - // Allocate intermediate tensor - if(!_are_weights_reshaped) - { - _weights_reshaped.allocator()->allocate(); - } - //Configure Activation Layer if(_is_activationlayer_enabled) { @@ -585,17 +580,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI void NEGEMMConvolutionLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(!_are_weights_reshaped) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _are_weights_reshaped = true; - _reshape_weights.run(); - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -610,11 +595,6 @@ void NEGEMMConvolutionLayer::run() if(_asm_glue._optimised_kernel != nullptr) { _asm_glue.run(); - // Release weights in case buffer is pretransposed - if(!_weights_reshaped.is_used()) - { - _weights_reshaped.allocator()->free(); - } } else { @@ -659,4 +639,43 @@ void NEGEMMConvolutionLayer::run() _memory_group.release(); } + +void NEGEMMConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + // Run weights reshaping (Runs once for every configure) + if(!_are_weights_reshaped) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + _weights_reshaped.allocator()->allocate(); + _reshape_weights.run(); + _reshape_weights = NEConvolutionLayerReshapeWeights(); + _original_weights->mark_as_unused(); + _are_weights_reshaped = true; + } + + // Run GEMM prepare stage + if(_asm_glue._optimised_kernel) + { + _asm_glue.prepare(); + } + else + { + if(_is_quantized) + { + _mm_gemmlowp.prepare(); + } + } + + // Release weights in case buffer is pretransposed + if(!_weights_reshaped.is_used()) + { + _weights_reshaped.allocator()->free(); + } + + _is_prepared = true; + } +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 2e06fa2ef4..a92ffa7c7b 100644 --- 
a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -42,8 +42,8 @@ using namespace arm_compute::misc::shape_calculator; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0), - _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false) + _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _original_b(nullptr), _a_offset(0), _b_offset(0), + _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false) { } @@ -52,23 +52,32 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info)); + // Clear state + _mtx_a_reshape_kernel = nullptr; + _mtx_b_reshape_kernel = nullptr; + _asm_glue_signed._optimised_kernel = nullptr; + _asm_glue_unsigned._optimised_kernel = nullptr; + + // Set internal variables _a_offset = a->info()->quantization_info().offset; _b_offset = b->info()->quantization_info().offset; _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _is_prepared = false; + _original_b = b; #ifdef __aarch64__ switch(a->info()->data_type()) { case DataType::S8: { - _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed); + _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed); break; } case DataType::QASYMM8: case DataType::U8: { - _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned); + _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned); break; } default: @@ -160,10 +169,13 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(!_dot_product_path && !_run_vector_matrix_multiplication) { _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } } - if(_a_offset != 0) + if(_a_offset != 0 && !_reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); } @@ -248,22 +260,21 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso void NEGEMMLowpMatrixMultiplyCore::run() { + prepare(); + _memory_group.acquire(); - // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction - if(!_run_vector_matrix_multiplication && !_dot_product_path) + // Reshape inputs + if(_mtx_a_reshape_kernel) { - if(_mtx_a_reshape_kernel) - { - 
NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } - - if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run)) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } + NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); + } + if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run) + { + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); } + // Run GEMM if(_asm_glue_unsigned._optimised_kernel != nullptr) { _asm_glue_unsigned.run(); @@ -284,7 +295,7 @@ void NEGEMMLowpMatrixMultiplyCore::run() } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run)) + if(_a_offset != 0 && !_reshape_b_only_on_first_run) { NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); } @@ -293,6 +304,45 @@ void NEGEMMLowpMatrixMultiplyCore::run() NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); _memory_group.release(); +} - _is_first_run = false; +void NEGEMMLowpMatrixMultiplyCore::prepare() +{ + if(!_is_prepared) + { + // Run assembly reshape + if((_asm_glue_signed._optimised_kernel || _asm_glue_signed._optimised_kernel) && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + if(_asm_glue_unsigned._optimised_kernel != nullptr) + { + _asm_glue_unsigned.prepare(); + } + else if(_asm_glue_signed._optimised_kernel != nullptr) + { + _asm_glue_signed.prepare(); + } + _original_b->mark_as_unused(); + } + // Run non-assembly reshape + else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + // Run reshape kernel and mark original weights tensor as unused + _tmp_b.allocator()->allocate(); + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + _original_b->mark_as_unused(); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0 && _reshape_b_only_on_first_run) + { + _vector_sum_col.allocator()->allocate(); + NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + } + + _is_prepared = true; + } } diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp index 913acf86a2..0737bd2f73 100644 --- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp @@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_first_run(false), _original_weights(nullptr) + _is_prepared(false), _original_weights(nullptr) { } @@ -127,7 +127,7 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? 
nullptr : biases->info(), output->info(), conv_info)); bool _has_bias = (biases != nullptr); - _is_first_run = true; + _is_prepared = false; _original_weights = weights; const unsigned int kernel_width = weights->info()->dimension(0); @@ -160,24 +160,13 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h)); // Allocate intermediate tensors - _weights_reshaped.allocator()->allocate(); _input_im2col_reshaped.allocator()->allocate(); _gemm_output.allocator()->allocate(); } void NELocallyConnectedLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _is_first_run = false; - NEScheduler::get().schedule(&_weights_reshape_kernel, 3); - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -192,3 +181,18 @@ void NELocallyConnectedLayer::run() _memory_group.release(); } + +void NELocallyConnectedLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark original weights tensor as unused + _weights_reshaped.allocator()->allocate(); + NEScheduler::get().schedule(&_weights_reshape_kernel, 3); + _original_weights->mark_as_unused(); + + _is_prepared = true; + } +} diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index d6bc5cfd9a..d9f6c0e0f8 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -113,7 +113,7 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), - _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false) + _workspace(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false) { } /* arm_compute */ @@ -139,9 +139,10 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); } - _weights = weights; - _input = input; - _output = output; + _weights = weights; + _input = input; + _output = output; + _is_prepared = false; std::unique_ptr> transform_input_kernel; std::unique_ptr> transform_weights_kernel; @@ -198,10 +199,13 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor * const Tensor4DShape in_shape(internal_get_input_shape(input)); const size_t data_type_size = input->info()->element_size(); // Get the memory required to instantiate a new Winograd operator. 
- constexpr size_t storage_alignment = 64; - const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size; + constexpr size_t storage_alignment = 64; + + // Kernel Storage + const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size; _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8)); _kernel_storage.allocator()->allocate(); + // Input storage const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size; _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8)); @@ -331,13 +335,9 @@ void NEWinogradConvolutionLayer::run() { const DataLayout data_layout = _input->info()->data_layout(); + prepare(); + _memory_group.acquire(); - if(!_reshaped_kernel) - { - _reshaped_kernel = true; - _permute_weights.run(); - NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX); - } if(data_layout == DataLayout::NCHW) { @@ -491,4 +491,20 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen return Status{}; } +void NEWinogradConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + // Permute weights + _permute_weights.run(); + _weights->mark_as_unused(); + + // Transform weights + NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX); + _weights_hwio.allocator()->free(); + + _is_prepared = true; + } +} + } // namespace arm_compute -- cgit v1.2.1
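
The patch above contains no caller-side example of the new two-stage execution model, so here is a minimal, hypothetical sketch (not part of the commit) of how a user of the library might exploit the configure()/prepare()/run() split, using NEGEMM, one of the functions changed above. The tensor shapes, the iteration count and the main() harness are illustrative assumptions; the calls themselves (Tensor, TensorInfo, NEGEMM::configure, GEMMInfo) follow the public Arm Compute Library API of this period, with GEMMInfo's third constructor argument assumed to be reshape_b_only_on_first_run.

#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // D = alpha * A * B : A is MxK, B is KxN, D is MxN (shapes are example values)
    Tensor a, b, d;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32)); // K=64, M=32
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32)); // N=16, K=64
    d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // N=16, M=32

    NEGEMM gemm;
    // reshape_b_only_on_first_run = true declares B as constant, which is what
    // allows its reshape/pretranspose to be hoisted into the prepare() stage.
    gemm.configure(&a, &b, nullptr, &d, 1.f, 0.f, GEMMInfo(false, false, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();
    // ... fill a and b with data before preparing/running ...

    gemm.prepare();              // one-off stage: reshape B, mark the original B as unused
    for(int i = 0; i < 10; ++i)
    {
        gemm.run();              // steady state: no weight/B reshaping on this path
    }
    return 0;
}

Calling prepare() explicitly is optional: as the diff shows, every run() now calls prepare() first, guarded by an _is_prepared flag. The benefit of an explicit call is that the one-off cost (weight reshaping, B pretransposition, and marking the original weights/B tensor as unused so its memory can be reclaimed) is paid up front, for example at network load time, rather than inside the first inference.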