diff options
Diffstat (limited to 'src/runtime/CL')
-rw-r--r-- | src/runtime/CL/functions/CLDeconvolutionLayer.cpp | 18 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp | 35 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp | 10 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLGEMM.cpp | 6 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp | 35 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 68 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLLocallyConnectedLayer.cpp | 33 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLRNNLayer.cpp | 20 |
8 files changed, 155 insertions, 70 deletions
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 6c54b18b81..4c1ea5b9a2 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -38,7 +38,8 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor : _memory_group(std::move(memory_manager)), _scale_f(), _conv_f(), - _scaled_output() + _scaled_output(), + _is_prepared(false) { } @@ -104,6 +105,8 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top)); + _is_prepared = false; + _memory_group.manage(&_scaled_output); // configure scale function @@ -126,8 +129,21 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, void CLDeconvolutionLayer::run() { + prepare(); + _memory_group.acquire(); + _scale_f.run(); _conv_f.run(); + _memory_group.release(); } + +void CLDeconvolutionLayer::prepare() +{ + if(!_is_prepared) + { + _conv_f.prepare(); + _is_prepared = true; + } +} diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index c2b24e3c20..1815361a72 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -91,7 +91,7 @@ void CLDepthwiseConvolutionLayer3x3::run() CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer() : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), - _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr) + _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr) { } @@ -104,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w const size_t weights_h = weights->info()->dimension(1); const size_t weights_z = weights->info()->dimension(2); - _is_first_run = true; + _is_prepared = false; _original_weights = weights; _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); @@ -182,7 +182,6 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w // Allocate intermediate tensors _input_reshaped.allocator()->allocate(); - _weights_reshaped.allocator()->allocate(); _v2mm_output.allocator()->allocate(); } @@ -235,18 +234,7 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe void CLDepthwiseConvolutionLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - CLScheduler::get().enqueue(_weights_reshape_kernel); - CLScheduler::get().enqueue(_v2mm_weights_fill_border); - _is_first_run = false; - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); CLScheduler::get().enqueue(_im2col_kernel); CLScheduler::get().enqueue(_v2mm_input_fill_border); @@ -257,3 +245,20 @@ void CLDepthwiseConvolutionLayer::run() CLScheduler::get().enqueue(_output_stage_kernel); } } + +void CLDepthwiseConvolutionLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark original weights tensor as unused + _weights_reshaped.allocator()->allocate(); + CLScheduler::get().enqueue(_weights_reshape_kernel); + CLScheduler::get().enqueue(_v2mm_weights_fill_border); + _original_weights->mark_as_unused(); + + CLScheduler::get().queue().finish(); + _is_prepared = true; + } +} diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp index af2c6f0eb8..fa2c3affa3 100644 --- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,14 @@ void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICL void CLDepthwiseSeparableConvolutionLayer::run() { + prepare(); + _depthwise_conv.run(); _pointwise_conv.run(); +} + +void CLDepthwiseSeparableConvolutionLayer::prepare() +{ + _depthwise_conv.prepare(); + _pointwise_conv.prepare(); }
\ No newline at end of file diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index f9713bb586..bb76872700 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -84,12 +84,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info)); - // Store original b matrix - _original_b = b; - // Check if we need to reshape the matrix B only on the first run _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _is_prepared = false; + _original_b = b; const ICLTensor *matrix_a = a; const ICLTensor *matrix_b = b; @@ -262,7 +260,7 @@ void CLGEMM::prepare() { if(_is_interleaved_transposed && _reshape_b_only_on_first_run) { - // Run transpose kernel + // Run transpose kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); CLScheduler::get().enqueue(_transpose_kernel, false); _original_b->mark_as_unused(); diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 27bed44098..82710b6461 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -91,8 +91,7 @@ void CLConvolutionLayerReshapeWeights::run() CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(), - _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false), - _retain_internal_weights(false) + _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false) { } @@ -166,10 +165,9 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * dilation, act_info)); - _is_prepared = false; - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _retain_internal_weights = weights_info.retain_internal_weights(); + _is_prepared = weights_info.retain_internal_weights(); + _original_weights = weights; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const DataType dt = input->info()->data_type(); @@ -408,23 +406,18 @@ void CLGEMMConvolutionLayer::prepare() { if(!_is_prepared) { - if(!_retain_internal_weights) - { - // Run weights reshaping and mark as unused - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - _weights_reshaped.allocator()->allocate(); - _reshape_weights.run(); - _original_weights->mark_as_unused(); - } + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark original weights tensor as unused + _weights_reshaped.allocator()->allocate(); + _reshape_weights.run(); + _original_weights->mark_as_unused(); - // Run GEMM prepare - if(!_is_quantized) + // Prepare GEMM + _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare(); + if(!_weights_reshaped.is_used()) { - _mm_gemm.prepare(); - if(!_weights_reshaped.is_used() && !_retain_internal_weights) - { - _weights_reshaped.allocator()->free(); - } + _weights_reshaped.allocator()->free(); } CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 711b006ede..94dc0e071c 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -59,8 +59,23 @@ inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_o } // namespace CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), - _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false) + : _memory_group(std::move(memory_manager)), + _mm_kernel(), + _mtx_a_reshape_kernel(), + _mtx_b_reshape_kernel(), + _mtx_a_reduction_kernel(), + _mtx_b_reduction_kernel(), + _offset_contribution_kernel(), + _vector_sum_col(), + _vector_sum_row(), + _tmp_a(), + _tmp_b(), + _original_b(nullptr), + _a_offset(0), + _b_offset(0), + _is_interleaved_transposed(true), + _reshape_b_only_on_first_run(false), + _is_prepared(false) { } @@ -70,6 +85,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor ARM_COMPUTE_UNUSED(gemm_info); ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info)); + _is_prepared = false; + _original_b = b; _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _a_offset = a->info()->quantization_info().offset; _b_offset = b->info()->quantization_info().offset; @@ -149,10 +166,13 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor if(_is_interleaved_transposed) { _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } } - if(_a_offset != 0) + if(_a_offset != 0 && !_reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); } @@ -234,6 +254,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso void CLGEMMLowpMatrixMultiplyCore::run() { + prepare(); + _memory_group.acquire(); if(_is_interleaved_transposed) @@ -241,21 +263,17 @@ void CLGEMMLowpMatrixMultiplyCore::run() // Run reshape matrix A CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false); - if(_is_first_run || !_reshape_b_only_on_first_run) + if(!_reshape_b_only_on_first_run) { // Run reshape matrix B CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); } } - // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once - if(_is_first_run || !_reshape_b_only_on_first_run) + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0 && !_reshape_b_only_on_first_run) { - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); - } + CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); } // Run matrix multiply @@ -271,6 +289,30 @@ void CLGEMMLowpMatrixMultiplyCore::run() CLScheduler::get().enqueue(_offset_contribution_kernel, true); _memory_group.release(); +} + +void CLGEMMLowpMatrixMultiplyCore::prepare() +{ + if(!_is_prepared) + { + if(_is_interleaved_transposed && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + // Run reshape kernel and mark original weights tensor as unused + _tmp_b.allocator()->allocate(); + CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); + _original_b->mark_as_unused(); + } - _is_first_run = false; + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0 && _reshape_b_only_on_first_run) + { + _vector_sum_col.allocator()->allocate(); + CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); + } + + CLScheduler::get().queue().finish(); + _is_prepared = true; + } } diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp index 31d5cd5a7e..d15e5dfa3d 100644 --- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp @@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_first_run(false), _original_weights(nullptr) + _is_prepared(false), _original_weights(nullptr) { } @@ -128,7 +128,7 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor bool _has_bias = (biases != nullptr); _original_weights = weights; - _is_first_run = true; + _is_prepared = false; const unsigned int kernel_width = weights->info()->dimension(0); const unsigned int kernel_height = weights->info()->dimension(1); @@ -160,7 +160,6 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h)); // Allocate intermediate tensors - _weights_reshaped.allocator()->allocate(); _input_im2col_reshaped.allocator()->allocate(); _gemm_output.allocator()->allocate(); @@ -169,17 +168,7 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor void CLLocallyConnectedLayer::run() { - // Run weights reshaping (Runs once for every configure) - if(_is_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _is_first_run = false; - CLScheduler::get().enqueue(_weights_reshape_kernel); - - // Mark original weights tensor as unused - _original_weights->mark_as_unused(); - } + prepare(); _memory_group.acquire(); @@ -194,3 +183,19 @@ void CLLocallyConnectedLayer::run() _memory_group.release(); } + +void CLLocallyConnectedLayer::prepare() +{ + if(!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights reshaping and mark original weights tensor as unused + _weights_reshaped.allocator()->allocate(); + CLScheduler::get().enqueue(_weights_reshape_kernel); + _original_weights->mark_as_unused(); + + CLScheduler::get().queue().finish(); + _is_prepared = true; + } +} diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 4843ba6364..0e1b9d5b58 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -36,7 +36,8 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output() + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), + _is_prepared(false) { } @@ -74,6 +75,8 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + _is_prepared = false; + _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); @@ -100,7 +103,10 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con void CLRNNLayer::run() { + prepare(); + _memory_group.acquire(); + _fully_connected_kernel.run(); _gemm_state_f.run(); CLScheduler::get().enqueue(_add_kernel); @@ -108,5 +114,17 @@ void CLRNNLayer::run() // copy hidden out to output CLScheduler::get().enqueue(_copy_kernel); + _memory_group.release(); +} + +void CLRNNLayer::prepare() +{ + if(!_is_prepared) + { + _fully_connected_kernel.prepare(); + _gemm_state_f.prepare(); + + _is_prepared = true; + } }
\ No newline at end of file |