Diffstat (limited to 'src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp')
-rw-r--r--  src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp  |  40
1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 5ff4fbceee..025a16b4fb 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -69,7 +69,7 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
-      _is_first_run(true), _is_activationlayer_enabled(false)
+      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
 {
 }
@@ -97,6 +97,9 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
                                                      conv_info,
                                                      input->info()->data_layout());
+    _is_prepared      = false;
+    _original_weights = weights;
+
     // Manage intermediate tensors
     _memory_group.manage(&_input0);
     _memory_group.manage(&_batched_mm_output);
@@ -124,7 +127,6 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
     // Allocate temporary tensors
     _input0.allocator()->allocate();
-    _input1.allocator()->allocate();
     _batched_mm_output.allocator()->allocate();
 }
@@ -182,11 +184,7 @@ Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
 void CLWinogradConvolutionLayer::run()
 {
-    if(_is_first_run)
-    {
-        // Run filter transform
-        CLScheduler::get().enqueue(_filter_transform, false);
-    }
+    prepare();
 
     _memory_group.acquire();
@@ -196,13 +194,6 @@ void CLWinogradConvolutionLayer::run()
     // Run batched matrix multiplication
     _batched_mm.run();
-    // Release reshaped weights if marked unused by CLGEMM
-    if(_is_first_run && !_input1.is_used())
-    {
-        CLScheduler::get().queue().finish();
-        _input1.allocator()->free();
-    }
-
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
@@ -212,6 +203,25 @@ void CLWinogradConvolutionLayer::run()
     }
     _memory_group.release();
+}
+
+void CLWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run filter transform and mark original weights as unused
+        _input1.allocator()->allocate();
+        CLScheduler::get().enqueue(_filter_transform, false);
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+        _batched_mm.prepare();
+        if(!_input1.is_used())
+        {
+            _input1.allocator()->free();
+        }
-    _is_first_run = false;
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
 }
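
The change above swaps the _is_first_run flag for the prepare()/_is_prepared idiom: one-time work (running the filter transform, dropping the original weights, and preparing the batched GEMM) moves out of run() into an idempotent prepare() that run() calls unconditionally but that only does work on its first invocation. The sketch below is a minimal standalone illustration of that idiom, assuming nothing beyond what the diff shows; the class, tensor type, and member names are invented for the example and are not Compute Library API.

#include <iostream>
#include <vector>

// Stand-in for a tensor: owns a buffer that can be freed once it is no longer needed.
struct Tensor
{
    std::vector<float> data;

    void free()
    {
        data.clear();
        data.shrink_to_fit();
    }
    bool empty() const
    {
        return data.empty();
    }
};

// Hypothetical layer following the prepare()/run() split introduced by the patch.
class WinogradLikeLayer
{
public:
    void configure(Tensor *weights)
    {
        _original_weights = weights;
        _is_prepared      = false; // configure() can be called again with new weights
    }

    // One-time setup: transform the weights, then release the original copy.
    // Safe to call from every run() because the flag makes it a no-op afterwards.
    void prepare()
    {
        if(!_is_prepared)
        {
            // Placeholder for the real filter transform: here we just copy the data.
            _transformed_weights.data.assign(_original_weights->data.begin(), _original_weights->data.end());
            _original_weights->free(); // original weights no longer needed
            _is_prepared = true;
        }
    }

    void run()
    {
        prepare(); // deferred one-time setup
        std::cout << "running with " << _transformed_weights.data.size() << " transformed weights\n";
    }

private:
    Tensor *_original_weights{ nullptr };
    Tensor  _transformed_weights{};
    bool    _is_prepared{ false };
};

int main()
{
    Tensor weights{ { 1.f, 2.f, 3.f } };
    WinogradLikeLayer layer;
    layer.configure(&weights);

    layer.run(); // first run triggers prepare(): weights transformed, originals freed
    layer.run(); // later runs skip the transform entirely

    std::cout << "original weights freed: " << std::boolalpha << weights.empty() << '\n';
    return 0;
}

The point of the split is that prepare() is idempotent, so run() can invoke it unconditionally (and callers could invoke it early to front-load the filter transform), while the memory held by the untransformed weights is released as soon as the transformed copy exists.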