From e043767d068da389308507011d944e6db9e4d676 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 2 May 2018 14:07:55 +0100
Subject: COMPMID-920: Introduce prepare() stage

Change-Id: I08ddb7f6e061178e7566518b48e4e18f8f078596
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129825
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 arm_compute/graph/Workload.h                        | 11 ++--
 arm_compute/graph/detail/ExecutionHelpers.h         | 21 ++++
 .../runtime/CL/functions/CLConvolutionLayer.h       |  3 +-
 .../runtime/CL/functions/CLFullyConnectedLayer.h    |  1 +
 arm_compute/runtime/CL/functions/CLGEMM.h           |  3 +-
 .../runtime/CL/functions/CLGEMMConvolutionLayer.h   |  3 +-
 .../CL/functions/CLWinogradConvolutionLayer.h       | 12 ++++-
 arm_compute/runtime/IFunction.h                     | 14 +++--
 examples/graph_vgg19.cpp                            |  6 +--
 src/graph/GraphManager.cpp                          | 48 ++++++++++++-----
 src/graph/Workload.cpp                              |  8 +++
 src/graph/detail/ExecutionHelpers.cpp               | 63 ++++++++++++++++++++--
 src/runtime/CL/functions/CLConvolutionLayer.cpp     |  6 +++
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp  | 46 +++++++++-------
 src/runtime/CL/functions/CLGEMM.cpp                 | 39 ++++++++------
 .../CL/functions/CLGEMMConvolutionLayer.cpp         | 52 +++++++++---------
 .../CL/functions/CLWinogradConvolutionLayer.cpp     | 40 ++++++++------
 17 files changed, 275 insertions(+), 101 deletions(-)

diff --git a/arm_compute/graph/Workload.h b/arm_compute/graph/Workload.h
index b19c932636..11bb22ea9a 100644
--- a/arm_compute/graph/Workload.h
+++ b/arm_compute/graph/Workload.h
@@ -37,6 +37,7 @@ namespace graph
 class ITensorHandle;
 class INode;
 class Tensor;
+class Graph;
 
 /** Execution task
  *
@@ -52,14 +53,18 @@ struct ExecutionTask
 
     /** Function operator */
     void operator()();
+
+    /** Prepare execution task */
+    void prepare();
 };
 
 /** Execution workload */
 struct ExecutionWorkload
 {
-    std::vector<ITensorHandle *> inputs  = {}; /**< Input handles */
-    std::vector<ITensorHandle *> outputs = {}; /**< Output handles */
-    std::vector<ExecutionTask>   tasks   = {}; /**< Execution workload */
+    std::vector<ITensorHandle *> inputs  = {};      /**< Input handles */
+    std::vector<ITensorHandle *> outputs = {};      /**< Output handles */
+    std::vector<ExecutionTask>   tasks   = {};      /**< Execution workload */
+    Graph                       *graph   = nullptr; /**< Graph bound to the workload */
 };
 } // namespace graph
 } // namespace arm_compute
diff --git a/arm_compute/graph/detail/ExecutionHelpers.h b/arm_compute/graph/detail/ExecutionHelpers.h
index 52304d6836..a868df8a5d 100644
--- a/arm_compute/graph/detail/ExecutionHelpers.h
+++ b/arm_compute/graph/detail/ExecutionHelpers.h
@@ -35,6 +35,7 @@ class Graph;
 class GraphContext;
 class ExecutionWorkload;
 class Tensor;
+class INode;
 
 namespace detail
 {
@@ -45,6 +46,21 @@ void default_initialize_backends();
  * @param[in] g Graph to configure
  */
 void configure_all_tensors(Graph &g);
+/** Allocates all input tensors of a node.
+ *
+ * @param[in] node Node to allocate the input tensor of
+ */
+void allocate_all_input_tensors(INode &node);
+/** Allocates all output tensors of a node.
+ *
+ * @param[in] node Node to allocate the output tensor of
+ */
+void allocate_all_output_tensors(INode &node);
+/** Allocates const tensors of a given graph
+ *
+ * @param[in] g Graph to allocate the tensors
+ */
+void allocate_const_tensors(Graph &g);
 /** Allocates all tensors of a graph
  *
  * @param[in] g Graph to allocate the tensors
  */
@@ -88,6 +104,11 @@ void call_all_input_node_accessors(ExecutionWorkload &workload);
  * @param[in] workload Workload to execute
  */
 void call_all_output_node_accessors(ExecutionWorkload &workload);
+/** Prepares all tasks for execution
+ *
+ * @param[in] workload Workload to prepare
+ */
+void prepare_all_tasks(ExecutionWorkload &workload);
 /** Executes all tasks of a workload
  *
  * @param[in] workload Workload to execute
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 5c05334a56..97998b5595 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -106,10 +106,11 @@ public:
                                            const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false);
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     std::shared_ptr<IMemoryManager> _memory_manager;
-    std::unique_ptr<IFunction>      _function; /**< Function to run */
+    std::unique_ptr<IFunction>      _function;
 };
 }
 #endif /* __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 67c0467f3a..7fb5af9229 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -109,6 +109,7 @@ public:
 
     //Inherited methods override
     void run() override;
+    void prepare() override;
 
 private:
     void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index c5d7b86384..60ff32c6fa 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -100,6 +100,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     CLMemoryGroup _memory_group;
@@ -112,8 +113,8 @@ private:
     const ICLTensor *_original_b;
     bool             _is_interleaved_transposed;
     bool             _run_addition;
-    bool             _is_first_run;
     bool             _reshape_b_only_on_first_run;
+    bool             _is_prepared;
 };
 }
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index a24ac3ac1f..3dde52989b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -153,6 +153,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     /** Configures the appropriate matrix multiply routine
@@ -192,8 +193,8 @@ private:
     CLTensor _tmp_output;
 
     bool _is_quantized;
-    bool _is_first_run;
     bool _is_activationlayer_enabled;
+    bool _is_prepared;
 };
 }
 #endif /* __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
index a27976959c..594d6028e1 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -49,6 +49,14 @@ class CLWinogradConvolutionLayer : public IFunction
 public:
     /** Default constructor */
     CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradConvolutionLayer(const CLWinogradConvolutionLayer &) = delete;
+    /** Default move constructor */
+    CLWinogradConvolutionLayer(CLWinogradConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradConvolutionLayer &operator=(const CLWinogradConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    CLWinogradConvolutionLayer &operator=(CLWinogradConvolutionLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @note: This function only works with 3x3 and 5x5 kernels along with unit strides
@@ -92,6 +100,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     CLMemoryGroup                       _memory_group;
@@ -103,7 +112,8 @@ private:
     CLTensor                            _input0;
     CLTensor                            _input1;
    CLTensor                            _batched_mm_output;
-    bool                                _is_first_run;
+    const ICLTensor                    *_original_weights;
+    bool                                _is_prepared;
     bool                                _is_activationlayer_enabled;
 };
 }
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index a4e7ed15e0..f64b2be998 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@ namespace arm_compute
 class IFunction
 {
 public:
+    /** Destructor */
+    virtual ~IFunction() = default;
     /** Run the kernels contained in the function
      *
      * For NEON kernels:
@@ -43,12 +45,18 @@ public:
      * - The queue is then flushed.
      *
      * @note The function will not block until the kernels are executed. It is the user's responsibility to wait.
+     * @note Will call prepare() on first run if it hasn't been done
      */
     virtual void run() = 0;
-    /** Destructor
+    /** Prepare the function for executing
+     *
+     * Any one-off pre-processing step required by the function is handled here
      *
+     * @note Prepare stage might not need all the function's buffers' backing memory to be available in order to execute
      */
-    virtual ~IFunction() = default;
+    virtual void prepare()
+    {
+    }
 };
 }
 #endif /*__ARM_COMPUTE_IFUNCTION_H__ */
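
Before moving on to the runtime changes, it may help to see the run()/prepare() contract from IFunction.h above in one place. The sketch below is not part of the patch; the class name and members are purely illustrative, and only IFunction::run() and IFunction::prepare() come from the header:

    #include "arm_compute/runtime/IFunction.h"

    // Hypothetical function owning reshaped constant weights; it mirrors the
    // pattern the CL functions in this patch adopt.
    class ExampleFunction : public arm_compute::IFunction
    {
    public:
        void run() override
        {
            prepare(); // one-off work happens at most once, on first use
            // ... enqueue the per-run kernels here ...
        }

        void prepare() override
        {
            if(!_is_prepared)
            {
                // ... reshape constant weights, then release the originals ...
                _is_prepared = true;
            }
        }

    private:
        bool _is_prepared{ false };
    };

Callers that only ever invoke run() keep working unchanged; callers that want the one-off work done ahead of time (for example while other resources are still cheap to hold) can now invoke prepare() explicitly.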
diff --git a/examples/graph_vgg19.cpp b/examples/graph_vgg19.cpp
index 606b0e0142..28e1a0fe04 100644
--- a/examples/graph_vgg19.cpp
+++ b/examples/graph_vgg19.cpp
@@ -51,9 +51,8 @@ public:
         std::unique_ptr<IPreprocessor> preprocessor = arm_compute::support::cpp14::make_unique<CaffePreproccessor>(mean_rgb);
 
         // Set target. 0 (NEON), 1 (OpenCL), 2 (OpenCL with Tuner). By default it is NEON
-        const int         target           = argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0;
-        Target            target_hint      = set_target_hint(target);
-        ConvolutionMethod convolution_hint = ConvolutionMethod::DIRECT;
+        const int target      = argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0;
+        Target    target_hint = set_target_hint(target);
 
         // Parse arguments
         if(argc < 2)
@@ -88,7 +87,6 @@ public:
         }
 
         graph << target_hint
-              << convolution_hint
               << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), DataType::F32), get_input_accessor(image, std::move(preprocessor)))
               // Layer 1
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index c0720ac685..fa7dfdf8f8 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -74,23 +74,47 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
     auto workload = detail::configure_all_nodes(graph, ctx);
     ARM_COMPUTE_ERROR_ON_MSG(workload.tasks.empty(), "Could not configure all nodes!");
 
-    // Allocate all tensors
-    detail::allocate_all_tensors(graph);
+    // TODO (COMPMID-920) : Update prepare for NEON/GC
+    if(forced_target == Target::CL)
+    {
+        // Allocate const tensors and call accessors
+        detail::allocate_const_tensors(graph);
+        detail::call_all_const_node_accessors(graph);
+
+        // Prepare graph
+        detail::prepare_all_tasks(workload);
+
+        // Allocate all tensors
+        detail::allocate_all_tensors(graph);
+
+        // Finalize Graph context
+        ctx.finalize();
 
-    // Call accessors on all Const nodes
-    detail::call_all_const_node_accessors(graph);
+        // Register graph
+        _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
+    }
+    else
+    {
+        // Allocate all tensors
+        detail::allocate_all_tensors(graph);
 
-    _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
-    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
+        // Call accessors on all Const nodes
+        detail::call_all_const_node_accessors(graph);
 
-    // Finalize Graph context
-    ctx.finalize();
+        // Finalize Graph context
+        ctx.finalize();
 
-    // Make first run
-    execute_graph(graph);
+        // Register graph
+        _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
 
-    // Release all unused const nodes
-    detail::release_unused_tensors(graph);
+        // Make first run
+        execute_graph(graph);
+
+        // Release all unused const tensors
+        detail::release_unused_tensors(graph);
+    }
 }
 
 void GraphManager::execute_graph(Graph &graph)
diff --git a/src/graph/Workload.cpp b/src/graph/Workload.cpp
index c53a8a42da..f350bbf625 100644
--- a/src/graph/Workload.cpp
+++ b/src/graph/Workload.cpp
@@ -37,5 +37,13 @@ void ExecutionTask::operator()()
         task->run();
     }
 }
+
+void ExecutionTask::prepare()
+{
+    if(task)
+    {
+        task->prepare();
+    }
+}
 } // namespace graph
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index 5a50728164..0bb47f2b33 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -61,15 +61,61 @@ void configure_all_tensors(Graph &g)
     }
 }
 
+void allocate_all_input_tensors(INode &node)
+{
+    for(unsigned int i = 0; i < node.num_inputs(); ++i)
+    {
+        Tensor *tensor = node.input(i);
+        if(tensor != nullptr && !tensor->bound_edges().empty())
+        {
+            ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
+            tensor->handle()->allocate();
+        }
+    }
+}
+
+void allocate_all_output_tensors(INode &node)
+{
+    for(unsigned int i = 0; i < node.num_outputs(); ++i)
+    {
+        Tensor *tensor = node.output(i);
+        if(tensor != nullptr && !tensor->bound_edges().empty())
+        {
+            ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
+            tensor->handle()->allocate();
+        }
+    }
+}
+
+void allocate_const_tensors(Graph &g)
+{
+    for(auto &node : g.nodes())
+    {
+        if(node != nullptr)
+        {
+            switch(node->type())
+            {
+                case NodeType::Const:
+                case NodeType::Input:
+                    allocate_all_output_tensors(*node);
+                    break;
+                case NodeType::Output:
+                    allocate_all_input_tensors(*node);
+                default:
+                    break;
+            }
+        }
+    }
+}
+
 void allocate_all_tensors(Graph &g)
 {
     auto &tensors = g.tensors();
 
     for(auto &tensor : tensors)
     {
-        if(tensor && !tensor->bound_edges().empty())
+        if(tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used())
         {
-            ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
             tensor->handle()->allocate();
         }
     }
@@ -96,7 +142,8 @@ void validate_all_nodes(Graph &g)
 ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx)
 {
     ExecutionWorkload workload;
-    auto &nodes = g.nodes();
+    workload.graph = &g;
+    auto &nodes    = g.nodes();
 
     // Create tasks
     for(auto &node : nodes)
@@ -176,6 +223,16 @@ void call_all_input_node_accessors(ExecutionWorkload &workload)
     }
 }
 
+void prepare_all_tasks(ExecutionWorkload &workload)
+{
+    ARM_COMPUTE_ERROR_ON(workload.graph == nullptr);
+    for(auto &task : workload.tasks)
+    {
+        task.prepare();
+        release_unused_tensors(*workload.graph);
+    }
+}
+
 void call_all_tasks(ExecutionWorkload &workload)
 {
     for(auto &task : workload.tasks)
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 83281e1747..3d4fb113b2 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -135,5 +135,11 @@ ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *
 
 void CLConvolutionLayer::run()
 {
+    prepare();
     _function->run();
 }
+
+void CLConvolutionLayer::prepare()
+{
+    _function->prepare();
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 9b3bf48bca..151fa1b5fa 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -220,13 +220,6 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
     }
-
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped)
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
 }
 
 Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
@@ -311,17 +304,7 @@ Status CLFullyConnectedLayer::validate(const ITensorIn
 
 void CLFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -356,3 +339,30 @@ void CLFullyConnectedLayer::run()
 
     _memory_group.release();
 }
+
+void CLFullyConnectedLayer::prepare()
+{
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run reshape weights kernel and mark weights as unused
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_kernel.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_reshape_weights_output.is_used())
+            {
+                _reshape_weights_output.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _are_weights_reshaped = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 37fa0c5ba2..e735adba39 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -98,7 +98,7 @@ Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLT
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
-      _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -114,6 +114,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = false;
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -169,7 +170,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
     // Configure matrix addition kernel
@@ -188,6 +192,8 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ICLTen
 
 void CLGEMM::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
         CLScheduler::get().enqueue(_interleave_kernel, false);
 
-        if(_is_first_run)
-        {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
-
-            // Mark original b matrix as unused
-            if(_reshape_b_only_on_first_run)
-            {
-                _original_b->mark_as_unused();
-            }
-        }
-        else if(!_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
             CLScheduler::get().enqueue(_transpose_kernel, false);
@@ -223,6 +218,20 @@ void CLGEMM::run()
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            // Run transpose kernel
+            _tmp_b.allocator()->allocate();
+            CLScheduler::get().enqueue(_transpose_kernel, false);
+            _original_b->mark_as_unused();
+        }
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
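
As a usage note (not part of the patch): for CLGEMM the benefit of the new prepare() shows up when GEMMInfo::reshape_b_only_on_first_run() is set, because the transpose of B now runs once outside the steady-state run() path and the original B is then marked unused. A rough sketch follows; the shapes, names and initialisation are chosen purely for illustration:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMM.h"

    using namespace arm_compute;

    void gemm_prepare_example()
    {
        CLScheduler::get().default_init();

        // C = A * B with M=32, K=64, N=16 (illustrative shapes only)
        CLTensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));

        CLGEMM gemm;
        gemm.configure(&a, &b, nullptr, &dst, 1.f, 0.f, GEMMInfo(false, false, true /* reshape_b_only_on_first_run */));

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        gemm.prepare();     // transposes B once; the original B is marked unused
        for(int i = 0; i < 10; ++i)
        {
            gemm.run();     // steady-state runs skip the transpose entirely
        }
    }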
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index cf8a6a8a78..610eec4d67 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,7 +91,7 @@ void CLConvolutionLayerReshapeWeights::run()
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -165,7 +165,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
                                                  dilation,
                                                  act_info));
 
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
@@ -258,9 +258,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    _weights_reshaped.allocator()->allocate();
-
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
 
@@ -305,7 +302,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorI
     unsigned int mat_weights_cols = weights->dimension(3);
     unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized? nullptr:biases, nullptr));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
 
     // Create tensor info for im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
@@ -369,16 +366,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorI
 
 void CLGEMMConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _reshape_weights.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -398,13 +386,6 @@ void CLGEMMConvolutionLayer::run()
     {
         // Run gemm
        _mm_gemm.run();
-
-        // Release reshaped weights if marked unused by CLGEMM
-        if(_is_first_run && !_weights_reshaped.is_used())
-        {
-            CLScheduler::get().queue().finish();
-            _weights_reshaped.allocator()->free();
-        }
     }
 
     // Reshape output matrix
@@ -417,6 +398,29 @@ void CLGEMMConvolutionLayer::run()
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMMConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run weights reshaping and mark as unused
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Run GEMM prepare
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_weights_reshaped.is_used())
+            {
+                _weights_reshaped.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 5ff4fbceee..025a16b4fb 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -69,7 +69,7 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
-      _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
 {
 }
 
@@ -97,6 +97,9 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
                                                                 conv_info,
                                                                 input->info()->data_layout());
 
+    _is_prepared      = false;
+    _original_weights = weights;
+
     // Manage intermediate tensors
     _memory_group.manage(&_input0);
     _memory_group.manage(&_batched_mm_output);
@@ -124,7 +127,6 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
 
     // Allocate temporary tensors
     _input0.allocator()->allocate();
-    _input1.allocator()->allocate();
     _batched_mm_output.allocator()->allocate();
 }
 
@@ -182,11 +184,7 @@ Status CLWinogradConvolutionLayer::validate(const ITen
 
 void CLWinogradConvolutionLayer::run()
 {
-    if(_is_first_run)
-    {
-        // Run filter transform
-        CLScheduler::get().enqueue(_filter_transform, false);
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -196,13 +194,6 @@ void CLWinogradConvolutionLayer::run()
     // Run batched matrix multiplication
     _batched_mm.run();
 
-    // Release reshaped weights if marked unused by CLGEMM
-    if(_is_first_run && !_input1.is_used())
-    {
-        CLScheduler::get().queue().finish();
-        _input1.allocator()->free();
-    }
-
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
@@ -212,6 +203,25 @@ void CLWinogradConvolutionLayer::run()
     }
 
     _memory_group.release();
+}
+
+void CLWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run filter transform and mark original weights as unused
+        _input1.allocator()->allocate();
+        CLScheduler::get().enqueue(_filter_transform, false);
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+        _batched_mm.prepare();
+        if(!_input1.is_used())
+        {
+            _input1.allocator()->free();
+        }
 
-    _is_first_run = false;
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
-- 
cgit v1.2.1
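
To summarise the graph-side effect of the patch (a condensed restatement rather than additional code): for the CL target, GraphManager::finalize_graph() now interleaves preparation with allocation, so original weights, their reshaped copies and the activation buffers never all have to be live at the same time. Using only the detail:: helpers introduced above, the CL path boils down to something like the following sketch (the wrapper function name is illustrative):

    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/GraphContext.h"
    #include "arm_compute/graph/Workload.h"
    #include "arm_compute/graph/detail/ExecutionHelpers.h"

    using namespace arm_compute::graph;

    void finalize_cl_path(Graph &graph, GraphContext &ctx, ExecutionWorkload &workload)
    {
        detail::allocate_const_tensors(graph);        // back only Const/Input/Output tensors
        detail::call_all_const_node_accessors(graph); // fill weights and biases
        detail::prepare_all_tasks(workload);          // each task reshapes its weights,
                                                      // then unused originals are released
        detail::allocate_all_tensors(graph);          // allocate what is still resizable and used
        ctx.finalize();                               // finalize the memory managers
    }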