From 593c2425e6b94828fb486244e42c275a89a71aff Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Wed, 11 Aug 2021 14:06:28 +0100
Subject: Fix performance regression due to clFinish()

- ClGemmLowpMatrixMultiplyCore::prepare() always called clFinish(),
  even when the workload was already prepared. The call is now made
  only inside the block that sets _is_prepared.

Resolves COMPMID-4707

Change-Id: Icdcee528590e2c5efb75325a80c2a45ec84993d1
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6082
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
---
 src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
index 64c8743f13..0c72912642 100644
--- a/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -773,9 +773,9 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
                 shifts_tensor->unmap(CLScheduler::get().queue());
             }
         }
+        CLScheduler::get().queue().finish();
         _is_prepared = true;
     }
-    CLScheduler::get().queue().finish();
 }
 
 experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const
-- 
cgit v1.2.1