21 files changed, 122 insertions, 36 deletions
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index 82969301b0..7767b73e10 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -94,12 +94,14 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     CLMemoryGroup                _memory_group;
     CLDeconvolutionLayerUpsample _scale_f;
     CLConvolutionLayer           _conv_f;
     CLTensor                     _scaled_output;
+    bool                         _is_prepared;
 };
 }
 #endif /* __ARM_COMPUTE_CLDECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index b1eb4b9e04..229fb24010 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -140,6 +140,7 @@ public:
 
     // Inherited methods overriden:
     void run() override;
+    void prepare() override;
 
 private:
     CLDepthwiseIm2ColKernel                   _im2col_kernel;
@@ -153,7 +154,7 @@ private:
     CLTensor                                  _weights_reshaped;
     CLTensor                                  _v2mm_output;
     CLTensor                                  _output_reshaped;
-    bool                                      _is_first_run;
+    bool                                      _is_prepared;
     bool                                      _is_quantized;
     const ICLTensor                          *_original_weights;
 };
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
index 27cee5ed3b..a43461048a 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,6 +70,7 @@ public:
 
     // Inherited methods overriden:
     void run() override;
+    void prepare() override;
 
 private:
     CLDepthwiseConvolutionLayer _depthwise_conv;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index aaa432616d..3dde52989b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -195,7 +195,6 @@ private:
     bool _is_quantized;
     bool _is_activationlayer_enabled;
     bool _is_prepared;
-    bool _retain_internal_weights;
 };
 }
 #endif /* __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 3976704907..f404ccdf4c 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,6 +53,14 @@ class CLGEMMLowpMatrixMultiplyCore : public IFunction
 public:
     /** Constructor */
     CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyCore(const CLGEMMLowpMatrixMultiplyCore &) = delete;
+    /** Default move constructor */
+    CLGEMMLowpMatrixMultiplyCore(CLGEMMLowpMatrixMultiplyCore &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyCore &operator=(const CLGEMMLowpMatrixMultiplyCore &) = delete;
+    /** Default move assignment operator */
+    CLGEMMLowpMatrixMultiplyCore &operator=(CLGEMMLowpMatrixMultiplyCore &&) = default;
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM_LOWP:  low precision GEMM kernel
@@ -83,6 +91,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     CLMemoryGroup                      _memory_group;
@@ -96,11 +105,12 @@ private:
     CLTensor                           _vector_sum_row;
     CLTensor                           _tmp_a;
     CLTensor                           _tmp_b;
+    const ICLTensor                   *_original_b;
     int32_t                            _a_offset;
     int32_t                            _b_offset;
     bool                               _is_interleaved_transposed;
-    bool                               _is_first_run;
     bool                               _reshape_b_only_on_first_run;
+    bool                               _is_prepared;
 };
 }
 #endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index b7b2587454..c2bb47c550 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -90,6 +90,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     CLMemoryGroup                          _memory_group;
@@ -100,7 +101,7 @@ private:
     CLTensor                               _input_im2col_reshaped;
     CLTensor                               _weights_reshaped;
     CLTensor                               _gemm_output;
-    bool                                   _is_first_run;
+    bool                                   _is_prepared;
     const ICLTensor                       *_original_weights;
 };
 }
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 9f239a9e64..ab7407dbfc 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -69,6 +69,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     CLMemoryGroup              _memory_group;
@@ -80,6 +81,7 @@ private:
     CLTensor                   _fully_connected_out;
     CLTensor                   _gemm_output;
     CLTensor                   _add_output;
+    bool                       _is_prepared;
 };
 }
 #endif /* __ARM_COMPUTE_CLRNN_LAYER_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
index fa29f447c8..45a883948c 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
@@ -63,7 +63,6 @@ public:
 
 private:
     GCWeightsReshapeKernel _weights_reshape_kernel;
-    GCTensor               _weights_reshaped;
 };
 
 /** Basic function to compute the convolution layer. This function calls the following GLES kernels:
@@ -128,6 +127,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     /** Configures the appropriate matrix multiply routine
@@ -166,8 +166,8 @@ private:
     GCTensor _gemm_output;
     GCTensor _tmp_output;
 
-    bool _is_first_run;
     bool _is_activationlayer_enabled;
+    bool _is_prepared;
 };
 }
 
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
index 1f8dc3e1a0..cd108c3eab 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
@@ -65,6 +65,14 @@ class GCFullyConnectedLayer : public IFunction
 public:
     /** Constructor */
     GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    GCFullyConnectedLayer(const GCFullyConnectedLayer &) = delete;
+    /** Default move constructor */
+    GCFullyConnectedLayer(GCFullyConnectedLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    GCFullyConnectedLayer &operator=(const GCFullyConnectedLayer &) = delete;
+    /** Default move assignment operator */
+    GCFullyConnectedLayer &operator=(GCFullyConnectedLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in]  input                   Source tensor. Data type supported: F16/F32.
@@ -81,6 +89,7 @@ public:
 
     //Inherited methods override
     void run() override;
+    void prepare() override;
 
 private:
     void configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output);
@@ -93,6 +102,7 @@ private:
     GCGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
     GCTensor                            _im2col_output;
     GCTensor                            _reshape_weights_output;
+    const IGCTensor                    *_original_weights;
     bool                                _are_weights_reshaped;
     bool                                _is_fc_after_conv;
     bool                                _accumulate_biases;
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
index a1d6c8a438..2db254527f 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
@@ -50,7 +50,14 @@ class GCGEMM : public IFunction
 public:
     /** Default constructor. */
     GCGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    GCGEMM(const GCGEMM &) = delete;
+    /** Default move constructor */
+    GCGEMM(GCGEMM &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    GCGEMM &operator=(const GCGEMM &) = delete;
+    /** Default move assignment operator */
+    GCGEMM &operator=(GCGEMM &&) = default;
     /** Initialise the kernel's inputs and output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -86,6 +93,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     GCMemoryGroup              _memory_group;
@@ -95,10 +103,11 @@ private:
     GCGEMMMatrixAdditionKernel _ma_kernel;
     GCTensor                   _tmp_a;
     GCTensor                   _tmp_b;
+    const IGCTensor           *_original_b;
     bool                       _is_interleaved_transposed;
     bool                       _run_addition;
-    bool                       _is_first_run;
     bool                       _reshape_b_only_on_first_run;
+    bool                       _is_prepared;
 };
 }
 
diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
index 3aa43ec96e..c4ba1a584e 100644
--- a/arm_compute/runtime/NEON/AssemblyHelper.h
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -51,7 +51,7 @@ public:
     using TypeResult = TypeOutput;
     /** Default constructor. */
     AssemblyKernelGlue()
-        : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr)
+        : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr), _is_prepared(false)
     {
     }
     /** Assembly Gemm */
@@ -76,6 +76,31 @@ public:
     ITensor *_workspace;
     /** Pre-transpose tensor */
     ITensor *_pretranspose;
+    /** Prepared flag */
+    bool _is_prepared;
+
+    /** Runs a preparation step, usually for pre-transposing matrix b */
+    void prepare()
+    {
+        // Pretranspose B if required
+        if(_gemm_kernel_asm->B_pretranspose_required())
+        {
+            // Validate the pre-transpose tensor up front: it is dereferenced below to obtain the destination buffer
+            ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
+            const int  ldb            = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+            const auto in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer());
+            const int  multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+            // Forcing 128-byte alignment (required by 32-bit kernels)
+            const unsigned int alignment   = 128;
+            void              *raw_ptr     = reinterpret_cast<void *>(_pretranspose->buffer());
+            size_t             space       = _pretranspose->info()->total_size();
+            void              *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
+            _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
+            _b->mark_as_unused();
+        }
+        _is_prepared = true;
+    }
 
     /** Configures the arrays pointers and strides in the assembly kernel and executes the assembly kernel.
      *  The call to set_arrays is needed to deal with the input sizes containing batches (dims > 2)
@@ -102,28 +127,25 @@ public:
         const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
         auto       out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer());
 
-        // Set workspace if needed
+        // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
         if(_workspace != nullptr)
         {
             _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace->buffer()));
+            const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+            unsigned int       num_threads = NEScheduler::get().num_threads();
+            if(window_size < num_threads)
+            {
+                num_threads = window_size;
+                _gemm_kernel_asm->set_nthreads(num_threads);
+            }
         }
 
+        // Prepare assembly kernel
+        prepare();
+
         // Set gemm parameters
         _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
 
-        // Pretranspose B if required
-        if(_gemm_kernel_asm->B_pretranspose_required())
-        {
-            // Forcing 128-byte alignment (required by 32-bit kernels)
-            const unsigned int alignment   = 128;
-            void              *raw_ptr     = reinterpret_cast<void *>(_pretranspose->buffer());
-            size_t             space       = _pretranspose->info()->total_size();
-            void              *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
-            ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
-            _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
-            _b->mark_as_unused();
-        }
-
         // Schedule assembly kernel
         NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
     }
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index ff41f0c985..e143814a4e 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -112,6 +112,7 @@ public:
                                                     const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     std::shared_ptr<IMemoryManager> _memory_manager;
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 66c6d427ba..3e527168c1 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -108,6 +108,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     MemoryGroup        _memory_group;
@@ -117,6 +118,7 @@ private:
     ITensor           *_input;
     PadStrideInfo      _info;
     std::pair<unsigned int, unsigned int> _inner_border;
+    bool _is_prepared;
 };
 } // arm_compute
 #endif /* __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index b80fb7f2c8..aa4cace7c2 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -122,6 +122,7 @@ public:
 
     // Inherited methods overriden:
     void run() override;
+    void prepare() override;
 
 private:
     NEDepthwiseIm2ColKernel                   _im2col_kernel;
@@ -135,7 +136,7 @@ private:
     Tensor                                    _weights_reshaped;
     Tensor                                    _v2mm_output;
     Tensor                                    _output_reshaped;
-    bool                                      _is_first_run;
+    bool                                      _is_prepared;
     bool                                      _is_quantized;
     const ITensor                            *_original_weights;
 };
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
index 0562c07515..99e93ccece 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,6 +70,7 @@ public:
 
     // Inherited methods overriden:
     void run() override;
+    void prepare() override;
 
 private:
     NEDepthwiseConvolutionLayer _depthwise_conv;
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 071eecc3f7..2739f5ebef 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -127,22 +127,23 @@ public:
 
     //Inherited methods override
     void run() override;
+    void prepare() override;
 
 private:
     MemoryGroup                         _memory_group;
     NEIm2ColKernel                      _im2col_kernel;
-    NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+    NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
     NEGEMMInterleave4x4Kernel           _interleave4x4_kernel;
     NEGEMMMatrixMultiplyKernel          _mm_kernel;
     NEGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
     Tensor                              _im2col_output;
     Tensor                              _interleave4x4_output;
     Tensor                              _reshape_weights_output;
-    bool                                _are_weights_reshaped;
+    const ITensor                      *_original_weights;
     bool                                _is_batched_fc_layer;
     bool                                _linearize_input;
     bool                                _accumulate_biases;
-    const ITensor                      *_original_weights;
+    bool                                _is_prepared;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index e2263c2307..5d108b2c14 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -53,7 +53,14 @@ class NEGEMM : public IFunction
 public:
     /** Constructor */
     NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMM(const NEGEMM &) = delete;
+    /** Default move constructor */
+    NEGEMM(NEGEMM &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMM &operator=(const NEGEMM &) = delete;
+    /** Default move assignment operator */
+    NEGEMM &operator=(NEGEMM &&) = default;
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -72,6 +79,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     MemoryGroup                _memory_group;
@@ -84,10 +92,11 @@ private:
     Tensor                     _tmp_b;
     Tensor                     _workspace;
     Tensor                     _B_pretransposed;
+    const ITensor             *_original_b;
     bool                       _run_vector_matrix_multiplication;
     bool                       _run_addition;
-    bool                       _is_first_run;
     bool                       _reshape_b_only_on_first_run;
+    bool                       _is_prepared;
 };
-}
+} // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index d64fd9e771..7075becf75 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -153,6 +153,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     /** Configures the appropriate matrix multiply routine
@@ -197,6 +198,7 @@ private:
     bool       _is_interleaved;
     bool       _is_activationlayer_enabled;
     bool       _skip_im2col;
+    bool       _is_prepared;
 };
 }
 #endif /* __ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index adcddb8263..f32eb3c757 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -56,6 +56,14 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
 public:
     /** Constructor */
     NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+    /** Default move constructor */
+    NEGEMMLowpMatrixMultiplyCore(NEGEMMLowpMatrixMultiplyCore &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMLowpMatrixMultiplyCore &operator=(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+    /** Default move assignment operator */
+    NEGEMMLowpMatrixMultiplyCore &operator=(NEGEMMLowpMatrixMultiplyCore &&) = default;
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM_LOWP:  low precision GEMM kernel
@@ -86,6 +94,7 @@ public:
 
     // Inherited methods overridden
     void run() override;
+    void prepare() override;
 
 private:
     MemoryGroup                        _memory_group;
@@ -103,12 +112,13 @@ private:
     Tensor                             _tmp_b;
     Tensor                             _workspace;
     Tensor                             _B_pretranspose;
+    const ITensor                     *_original_b;
     int32_t                            _a_offset;
     int32_t                            _b_offset;
     bool                               _run_vector_matrix_multiplication;
     bool                               _dot_product_path;
-    bool                               _is_first_run;
     bool                               _reshape_b_only_on_first_run;
+    bool                               _is_prepared;
 };
 }
 #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
index 18cd27414e..7d1f124bb3 100644
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -90,6 +90,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     MemoryGroup                            _memory_group;
@@ -100,7 +101,7 @@ private:
     Tensor                                 _input_im2col_reshaped;
     Tensor                                 _weights_reshaped;
     Tensor                                 _gemm_output;
-    bool                                   _is_first_run;
+    bool                                   _is_prepared;
     const ITensor                         *_original_weights;
 };
 }
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 55921f78f3..c1260977c0 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -74,6 +74,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
      *
@@ -122,7 +123,7 @@ private:
     const ITensor *_input;
     const ITensor *_weights;
     ITensor       *_output;
-    bool           _reshaped_kernel;
+    bool           _is_prepared;
     bool           _is_activationlayer_enabled;
 };
 }