From e043767d068da389308507011d944e6db9e4d676 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 2 May 2018 14:07:55 +0100
Subject: COMPMID-920: Introduce prepare() stage

Change-Id: I08ddb7f6e061178e7566518b48e4e18f8f078596
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129825
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 arm_compute/graph/Workload.h                        | 11 ++--
 arm_compute/graph/detail/ExecutionHelpers.h         | 21 ++++
 .../runtime/CL/functions/CLConvolutionLayer.h       |  3 +-
 .../runtime/CL/functions/CLFullyConnectedLayer.h    |  1 +
 arm_compute/runtime/CL/functions/CLGEMM.h           |  3 +-
 .../runtime/CL/functions/CLGEMMConvolutionLayer.h   |  3 +-
 .../CL/functions/CLWinogradConvolutionLayer.h       | 12 ++++-
 arm_compute/runtime/IFunction.h                     | 14 +++--
 examples/graph_vgg19.cpp                            |  6 +--
 src/graph/GraphManager.cpp                          | 48 ++++++++++++-----
 src/graph/Workload.cpp                              |  8 +++
 src/graph/detail/ExecutionHelpers.cpp               | 63 ++++++++++++++++++++--
 src/runtime/CL/functions/CLConvolutionLayer.cpp     |  6 +++
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp  | 46 +++++++++-------
 src/runtime/CL/functions/CLGEMM.cpp                 | 39 ++++++++------
 .../CL/functions/CLGEMMConvolutionLayer.cpp         | 52 +++++++++---------
 .../CL/functions/CLWinogradConvolutionLayer.cpp     | 40 ++++++++------
 17 files changed, 275 insertions(+), 101 deletions(-)

diff --git a/arm_compute/graph/Workload.h b/arm_compute/graph/Workload.h
index b19c932636..11bb22ea9a 100644
--- a/arm_compute/graph/Workload.h
+++ b/arm_compute/graph/Workload.h
@@ -37,6 +37,7 @@ namespace graph
 class ITensorHandle;
 class INode;
 class Tensor;
+class Graph;
 
 /** Execution task
  *
@@ -52,14 +53,18 @@ struct ExecutionTask
 
     /** Function operator */
     void operator()();
+
+    /** Prepare execution task */
+    void prepare();
 };
 
 /** Execution workload */
 struct ExecutionWorkload
 {
-    std::vector<ITensorHandle *> inputs  = {}; /**< Input handles */
-    std::vector<ITensorHandle *> outputs = {}; /**< Output handles */
-    std::vector<ExecutionTask>   tasks   = {}; /**< Execution workload */
+    std::vector<ITensorHandle *> inputs  = {};      /**< Input handles */
+    std::vector<ITensorHandle *> outputs = {};      /**< Output handles */
+    std::vector<ExecutionTask>   tasks   = {};      /**< Execution workload */
+    Graph                       *graph   = nullptr; /**< Graph bound to the workload */
 };
 } // namespace graph
 } // namespace arm_compute
diff --git a/arm_compute/graph/detail/ExecutionHelpers.h b/arm_compute/graph/detail/ExecutionHelpers.h
index 52304d6836..a868df8a5d 100644
--- a/arm_compute/graph/detail/ExecutionHelpers.h
+++ b/arm_compute/graph/detail/ExecutionHelpers.h
@@ -35,6 +35,7 @@ class Graph;
 class GraphContext;
 class ExecutionWorkload;
 class Tensor;
+class INode;
 
 namespace detail
 {
@@ -45,6 +46,21 @@ void default_initialize_backends();
  * @param[in] g Graph to configure
  */
 void configure_all_tensors(Graph &g);
+/** Allocates all input tensors of a node.
+ *
+ * @param[in] node Node to allocate the input tensor of
+ */
+void allocate_all_input_tensors(INode &node);
+/** Allocates all output tensors of a node.
+ *
+ * @param[in] node Node to allocate the output tensor of
+ */
+void allocate_all_output_tensors(INode &node);
+/** Allocates const tensors of a given graph
+ *
+ * @param[in] g Graph to allocate the tensors
+ */
+void allocate_const_tensors(Graph &g);
 /** Allocates all tensors of a graph
  *
  * @param[in] g Graph to allocate the tensors
  */
@@ -88,6 +104,11 @@ void call_all_input_node_accessors(ExecutionWorkload &workload);
  * @param[in] workload Workload to execute
  */
 void call_all_output_node_accessors(ExecutionWorkload &workload);
+/** Prepares all tasks for execution
+ *
+ * @param[in] workload Workload to prepare
+ */
+void prepare_all_tasks(ExecutionWorkload &workload);
 /** Executes all tasks of a workload
  *
  * @param[in] workload Workload to execute
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 5c05334a56..97998b5595 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -106,10 +106,11 @@ public:
                                            const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false);
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     std::shared_ptr<IMemoryManager> _memory_manager;
-    std::unique_ptr<IFunction>      _function; /**< Function to run */
+    std::unique_ptr<IFunction>      _function;
 };
 }
 #endif /* __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 67c0467f3a..7fb5af9229 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -109,6 +109,7 @@ public:
 
     //Inherited methods override
     void run() override;
+    void prepare() override;
 
 private:
     void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index c5d7b86384..60ff32c6fa 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -100,6 +100,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     CLMemoryGroup _memory_group;
@@ -112,8 +113,8 @@ private:
     const ICLTensor *_original_b;
     bool             _is_interleaved_transposed;
     bool             _run_addition;
-    bool             _is_first_run;
     bool             _reshape_b_only_on_first_run;
+    bool             _is_prepared;
 };
 }
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index a24ac3ac1f..3dde52989b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -153,6 +153,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     /** Configures the appropriate matrix multiply routine
@@ -192,8 +193,8 @@ private:
     CLTensor _tmp_output;
 
     bool _is_quantized;
-    bool _is_first_run;
     bool _is_activationlayer_enabled;
+    bool _is_prepared;
 };
 }
 #endif /* __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
index a27976959c..594d6028e1 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -49,6 +49,14 @@ class CLWinogradConvolutionLayer : public IFunction
 public:
     /** Default constructor */
     CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradConvolutionLayer(const CLWinogradConvolutionLayer &) = delete;
+    /** Default move constructor */
+    CLWinogradConvolutionLayer(CLWinogradConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWinogradConvolutionLayer &operator=(const CLWinogradConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    CLWinogradConvolutionLayer &operator=(CLWinogradConvolutionLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @note: This function only works with 3x3 and 5x5 kernels along with unit strides
@@ -92,6 +100,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     CLMemoryGroup                       _memory_group;
@@ -103,7 +112,8 @@ private:
     CLTensor                            _input0;
     CLTensor                            _input1;
    CLTensor                            _batched_mm_output;
-    bool                                _is_first_run;
+    const ICLTensor                    *_original_weights;
+    bool                                _is_prepared;
     bool                                _is_activationlayer_enabled;
 };
 }
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index a4e7ed15e0..f64b2be998 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@ namespace arm_compute
 class IFunction
 {
 public:
+    /** Destructor */
+    virtual ~IFunction() = default;
     /** Run the kernels contained in the function
      *
      * For NEON kernels:
@@ -43,12 +45,18 @@ public:
      * - The queue is then flushed.
      *
      * @note The function will not block until the kernels are executed. It is the user's responsibility to wait.
+     * @note Will call prepare() on first run if it hasn't been done
      */
     virtual void run() = 0;
-    /** Destructor
+    /** Prepare the function for executing
+     *
+     * Any one-off pre-processing step required by the function is handled here
      *
+     * @note Prepare stage might not need all the function's buffers' backing memory to be available in order to execute
      */
-    virtual ~IFunction() = default;
+    virtual void prepare()
+    {
+    }
 };
 }
 #endif /*__ARM_COMPUTE_IFUNCTION_H__ */
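
Before moving on to the runtime changes, it may help to see the run()/prepare() contract from IFunction.h above in one place. The sketch below is not part of the patch; the class name and members are purely illustrative, and only IFunction::run() and IFunction::prepare() come from the header:

    #include "arm_compute/runtime/IFunction.h"

    // Hypothetical function owning reshaped constant weights; it mirrors the
    // pattern the CL functions in this patch adopt.
    class ExampleFunction : public arm_compute::IFunction
    {
    public:
        void run() override
        {
            prepare(); // one-off work happens at most once, on first use
            // ... enqueue the per-run kernels here ...
        }

        void prepare() override
        {
            if(!_is_prepared)
            {
                // ... reshape constant weights, then release the originals ...
                _is_prepared = true;
            }
        }

    private:
        bool _is_prepared{ false };
    };

Callers that only ever invoke run() keep working unchanged; callers that want the one-off work done ahead of time (for example while other resources are still cheap to hold) can now invoke prepare() explicitly.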
diff --git a/examples/graph_vgg19.cpp b/examples/graph_vgg19.cpp
index 606b0e0142..28e1a0fe04 100644
--- a/examples/graph_vgg19.cpp
+++ b/examples/graph_vgg19.cpp
@@ -51,9 +51,8 @@ public:
         std::unique_ptr<IPreprocessor> preprocessor = arm_compute::support::cpp14::make_unique<CaffePreproccessor>(mean_rgb);
 
         // Set target. 0 (NEON), 1 (OpenCL), 2 (OpenCL with Tuner). By default it is NEON
-        const int         target           = argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0;
-        Target            target_hint      = set_target_hint(target);
-        ConvolutionMethod convolution_hint = ConvolutionMethod::DIRECT;
+        const int target      = argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0;
+        Target    target_hint = set_target_hint(target);
 
         // Parse arguments
         if(argc < 2)
@@ -88,7 +87,6 @@ public:
         }
 
         graph << target_hint
-              << convolution_hint
               << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), DataType::F32), get_input_accessor(image, std::move(preprocessor)))
               // Layer 1
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index c0720ac685..fa7dfdf8f8 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -74,23 +74,47 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
     auto workload = detail::configure_all_nodes(graph, ctx);
     ARM_COMPUTE_ERROR_ON_MSG(workload.tasks.empty(), "Could not configure all nodes!");
 
-    // Allocate all tensors
-    detail::allocate_all_tensors(graph);
+    // TODO (COMPMID-920) : Update prepare for NEON/GC
+    if(forced_target == Target::CL)
+    {
+        // Allocate const tensors and call accessors
+        detail::allocate_const_tensors(graph);
+        detail::call_all_const_node_accessors(graph);
+
+        // Prepare graph
+        detail::prepare_all_tasks(workload);
+
+        // Allocate all tensors
+        detail::allocate_all_tensors(graph);
+
+        // Finalize Graph context
+        ctx.finalize();
 
-    // Call accessors on all Const nodes
-    detail::call_all_const_node_accessors(graph);
+        // Register graph
+        _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
+    }
+    else
+    {
+        // Allocate all tensors
+        detail::allocate_all_tensors(graph);
 
-    _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
-    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
+        // Call accessors on all Const nodes
+        detail::call_all_const_node_accessors(graph);
 
-    // Finalize Graph context
-    ctx.finalize();
+        // Finalize Graph context
+        ctx.finalize();
 
-    // Make first run
-    execute_graph(graph);
+        // Register graph
+        _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
 
-    // Release all unused const nodes
-    detail::release_unused_tensors(graph);
+        // Make first run
+        execute_graph(graph);
+
+        // Release all unused const tensors
+        detail::release_unused_tensors(graph);
+    }
 }
 
 void GraphManager::execute_graph(Graph &graph)
diff --git a/src/graph/Workload.cpp b/src/graph/Workload.cpp
index c53a8a42da..f350bbf625 100644
--- a/src/graph/Workload.cpp
+++ b/src/graph/Workload.cpp
@@ -37,5 +37,13 @@ void ExecutionTask::operator()()
         task->run();
     }
 }
+
+void ExecutionTask::prepare()
+{
+    if(task)
+    {
+        task->prepare();
+    }
+}
 } // namespace graph
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index 5a50728164..0bb47f2b33 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -61,15 +61,61 @@ void configure_all_tensors(Graph &g)
     }
 }
 
+void allocate_all_input_tensors(INode &node)
+{
+    for(unsigned int i = 0; i < node.num_inputs(); ++i)
+    {
+        Tensor *tensor = node.input(i);
+        if(tensor != nullptr && !tensor->bound_edges().empty())
+        {
+            ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
+            tensor->handle()->allocate();
+        }
+    }
+}
+
+void allocate_all_output_tensors(INode &node)
+{
+    for(unsigned int i = 0; i < node.num_outputs(); ++i)
+    {
+        Tensor *tensor = node.output(i);
+        if(tensor != nullptr && !tensor->bound_edges().empty())
+        {
+            ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
+            tensor->handle()->allocate();
+        }
+    }
+}
+
+void allocate_const_tensors(Graph &g)
+{
+    for(auto &node : g.nodes())
+    {
+        if(node != nullptr)
+        {
+            switch(node->type())
+            {
+                case NodeType::Const:
+                case NodeType::Input:
+                    allocate_all_output_tensors(*node);
+                    break;
+                case NodeType::Output:
+                    allocate_all_input_tensors(*node);
+                default:
+                    break;
+            }
+        }
+    }
+}
+
 void allocate_all_tensors(Graph &g)
 {
     auto &tensors = g.tensors();
 
     for(auto &tensor : tensors)
     {
-        if(tensor && !tensor->bound_edges().empty())
+        if(tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used())
         {
-            ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
             tensor->handle()->allocate();
         }
     }
@@ -96,7 +142,8 @@ void validate_all_nodes(Graph &g)
 ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx)
 {
     ExecutionWorkload workload;
-    auto &nodes = g.nodes();
+    workload.graph = &g;
+    auto &nodes    = g.nodes();
 
     // Create tasks
     for(auto &node : nodes)
@@ -176,6 +223,16 @@ void call_all_input_node_accessors(ExecutionWorkload &workload)
     }
 }
 
+void prepare_all_tasks(ExecutionWorkload &workload)
+{
+    ARM_COMPUTE_ERROR_ON(workload.graph == nullptr);
+    for(auto &task : workload.tasks)
+    {
+        task.prepare();
+        release_unused_tensors(*workload.graph);
+    }
+}
+
 void call_all_tasks(ExecutionWorkload &workload)
 {
     for(auto &task : workload.tasks)
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 83281e1747..3d4fb113b2 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -135,5 +135,11 @@ ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *
 
 void CLConvolutionLayer::run()
 {
+    prepare();
     _function->run();
 }
+
+void CLConvolutionLayer::prepare()
+{
+    _function->prepare();
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 9b3bf48bca..151fa1b5fa 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -220,13 +220,6 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
     }
-
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped)
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
 }
 
 Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
@@ -311,17 +304,7 @@ Status CLFullyConnectedLayer::validate(const ITensorIn
 
 void CLFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -356,3 +339,30 @@ void CLFullyConnectedLayer::run()
 
     _memory_group.release();
 }
+
+void CLFullyConnectedLayer::prepare()
+{
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run reshape weights kernel and mark weights as unused
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_kernel.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_reshape_weights_output.is_used())
+            {
+                _reshape_weights_output.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _are_weights_reshaped = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 37fa0c5ba2..e735adba39 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -98,7 +98,7 @@ Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLT
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
-      _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -114,6 +114,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = false;
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -169,7 +170,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
     // Configure matrix addition kernel
@@ -188,6 +192,8 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ICLTen
 
 void CLGEMM::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
         CLScheduler::get().enqueue(_interleave_kernel, false);
 
-        if(_is_first_run)
-        {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
-
-            // Mark original b matrix as unused
-            if(_reshape_b_only_on_first_run)
-            {
-                _original_b->mark_as_unused();
-            }
-        }
-        else if(!_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
             CLScheduler::get().enqueue(_transpose_kernel, false);
@@ -223,6 +218,20 @@ void CLGEMM::run()
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            // Run transpose kernel
+            _tmp_b.allocator()->allocate();
+            CLScheduler::get().enqueue(_transpose_kernel, false);
+            _original_b->mark_as_unused();
+        }
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
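
As a usage note (not part of the patch): for CLGEMM the benefit of the new prepare() shows up when GEMMInfo::reshape_b_only_on_first_run() is set, because the transpose of B now runs once outside the steady-state run() path and the original B is then marked unused. A rough sketch follows; the shapes, names and initialisation are chosen purely for illustration:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMM.h"

    using namespace arm_compute;

    void gemm_prepare_example()
    {
        CLScheduler::get().default_init();

        // C = A * B with M=32, K=64, N=16 (illustrative shapes only)
        CLTensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));

        CLGEMM gemm;
        gemm.configure(&a, &b, nullptr, &dst, 1.f, 0.f, GEMMInfo(false, false, true /* reshape_b_only_on_first_run */));

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        gemm.prepare();     // transposes B once; the original B is marked unused
        for(int i = 0; i < 10; ++i)
        {
            gemm.run();     // steady-state runs skip the transpose entirely
        }
    }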
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index cf8a6a8a78..610eec4d67 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,7 +91,7 @@ void CLConvolutionLayerReshapeWeights::run()
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -165,7 +165,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
                                                  dilation,
                                                  act_info));
 
-    _is_first_run     = true;
+    _is_prepared      = false;
     _original_weights = weights;
     _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
@@ -258,9 +258,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    _weights_reshaped.allocator()->allocate();
-
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
 
@@ -305,7 +302,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorI
     unsigned int mat_weights_cols = weights->dimension(3);
     unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized? nullptr:biases, nullptr));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
 
     // Create tensor info for im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
@@ -369,16 +366,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorI
 
 void CLGEMMConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        _reshape_weights.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -398,13 +386,6 @@ void CLGEMMConvolutionLayer::run()
     {
         // Run gemm
        _mm_gemm.run();
-
-        // Release reshaped weights if marked unused by CLGEMM
-        if(_is_first_run && !_weights_reshaped.is_used())
-        {
-            CLScheduler::get().queue().finish();
-            _weights_reshaped.allocator()->free();
-        }
     }
 
     // Reshape output matrix
@@ -417,6 +398,29 @@ void CLGEMMConvolutionLayer::run()
     }
 
     _memory_group.release();
+}
 
-    _is_first_run = false;
+void CLGEMMConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run weights reshaping and mark as unused
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Run GEMM prepare
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_weights_reshaped.is_used())
+            {
+                _weights_reshaped.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 5ff4fbceee..025a16b4fb 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -69,7 +69,7 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
-      _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
 {
 }
 
@@ -97,6 +97,9 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
                                                                 conv_info,
                                                                 input->info()->data_layout());
 
+    _is_prepared      = false;
+    _original_weights = weights;
+
     // Manage intermediate tensors
     _memory_group.manage(&_input0);
     _memory_group.manage(&_batched_mm_output);
@@ -124,7 +127,6 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
 
     // Allocate temporary tensors
     _input0.allocator()->allocate();
-    _input1.allocator()->allocate();
     _batched_mm_output.allocator()->allocate();
 }
 
@@ -182,11 +184,7 @@ Status CLWinogradConvolutionLayer::validate(const ITen
 
 void CLWinogradConvolutionLayer::run()
 {
-    if(_is_first_run)
-    {
-        // Run filter transform
-        CLScheduler::get().enqueue(_filter_transform, false);
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -196,13 +194,6 @@ void CLWinogradConvolutionLayer::run()
     // Run batched matrix multiplication
     _batched_mm.run();
 
-    // Release reshaped weights if marked unused by CLGEMM
-    if(_is_first_run && !_input1.is_used())
-    {
-        CLScheduler::get().queue().finish();
-        _input1.allocator()->free();
-    }
-
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
@@ -212,6 +203,25 @@ void CLWinogradConvolutionLayer::run()
     }
 
     _memory_group.release();
+}
+
+void CLWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run filter transform and mark original weights as unused
+        _input1.allocator()->allocate();
+        CLScheduler::get().enqueue(_filter_transform, false);
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+        _batched_mm.prepare();
+        if(!_input1.is_used())
+        {
+            _input1.allocator()->free();
+        }
 
-    _is_first_run = false;
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
-- 
cgit v1.2.1
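
To summarise the graph-side effect of the patch (a condensed restatement rather than additional code): for the CL target, GraphManager::finalize_graph() now interleaves preparation with allocation, so original weights, their reshaped copies and the activation buffers never all have to be live at the same time. Using only the detail:: helpers introduced above, the CL path boils down to something like the following sketch (the wrapper function name is illustrative):

    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/GraphContext.h"
    #include "arm_compute/graph/Workload.h"
    #include "arm_compute/graph/detail/ExecutionHelpers.h"

    using namespace arm_compute::graph;

    void finalize_cl_path(Graph &graph, GraphContext &ctx, ExecutionWorkload &workload)
    {
        detail::allocate_const_tensors(graph);        // back only Const/Input/Output tensors
        detail::call_all_const_node_accessors(graph); // fill weights and biases
        detail::prepare_all_tasks(workload);          // each task reshapes its weights,
                                                      // then unused originals are released
        detail::allocate_all_tensors(graph);          // allocate what is still resizable and used
        ctx.finalize();                               // finalize the memory managers
    }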