path: root/src/runtime
author    Georgios Pinitas <georgios.pinitas@arm.com>    2021-08-20 21:39:25 +0100
committer Georgios Pinitas <georgios.pinitas@arm.com>    2021-08-25 16:23:15 +0000
commit    7891a73ef36f4ad7b71069b3c57694f85bb79454 (patch)
tree      5b08692989e28ce63de2937d8d92ea5176589dbe /src/runtime
parent    a46c9c98c2b1d70acc7c6eee00e2cdc2a1e209a6 (diff)
download  ComputeLibrary-7891a73ef36f4ad7b71069b3c57694f85bb79454.tar.gz
Move CPU/GPU files from Core/Runtime to the respective backend folders
The legacy structure contained two libraries (core and runtime), each with its own CPU and GPU backend folders. We reduce the core/runtime libraries to a single library, merging the backend files accordingly.

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I69545765fe7a730368105cdbd067d3135ec7a174
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6155
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
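The change is a mechanical move of the backend sources plus an include-path rewrite in the remaining runtime files; a representative summary of that rewrite, inferred from the hunks below:

    src/core/gpu/cl/...     -> src/gpu/cl/...
    src/runtime/gpu/cl/...  -> src/gpu/cl/...
    src/core/cpu/...        -> src/cpu/...
    src/runtime/cpu/...     -> src/cpu/...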
Diffstat (limited to 'src/runtime')
-rw-r--r--  src/runtime/CL/functions/CLActivationLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLCast.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLConcatenateLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLConvolutionLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLCopy.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLCrop.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLDepthConvertLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLDequantizationLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLDirectConvolutionLayer.cpp | 4
-rw-r--r--  src/runtime/CL/functions/CLElementwiseOperations.cpp | 6
-rw-r--r--  src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLFill.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLFlattenLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLFloor.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLGEMM.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLLSTMLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLLogicalAnd.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLLogicalNot.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLLogicalOr.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLPReluLayer.cpp | 4
-rw-r--r--  src/runtime/CL/functions/CLPermute.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLPixelWiseMultiplication.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLPoolingLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLQLSTMLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLQuantizationLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLReshapeLayer.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLScale.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLSoftmaxLayer.cpp | 6
-rw-r--r--  src/runtime/CL/functions/CLTranspose.cpp | 2
-rw-r--r--  src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp | 2
-rw-r--r--  src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp | 2
-rw-r--r--  src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp | 2
-rw-r--r--  src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp | 2
-rw-r--r--  src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp | 10
-rw-r--r--  src/runtime/NEON/functions/NEActivationLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEArithmeticAddition.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEArithmeticSubtraction.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NECast.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEConcatenateLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEConvolutionLayer.cpp | 10
-rw-r--r--  src/runtime/NEON/functions/NECopy.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEDepthConvertLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEDequantizationLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEElementwiseOperations.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEFill.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEFlattenLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEFloor.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEFullyConnectedLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConv2d.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEPReluLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEPermute.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEPoolingLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEQLSTMLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEQuantizationLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEReshapeLayer.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEScale.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NESoftmaxLayer.cpp | 4
-rw-r--r--  src/runtime/NEON/functions/NETranspose.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp | 4
-rw-r--r--  src/runtime/cpu/ICpuOperator.h | 36
-rw-r--r--  src/runtime/cpu/operators/CpuActivation.cpp | 72
-rw-r--r--  src/runtime/cpu/operators/CpuActivation.h | 54
-rw-r--r--  src/runtime/cpu/operators/CpuAdd.cpp | 46
-rw-r--r--  src/runtime/cpu/operators/CpuAdd.h | 68
-rw-r--r--  src/runtime/cpu/operators/CpuCast.cpp | 44
-rw-r--r--  src/runtime/cpu/operators/CpuCast.h | 71
-rw-r--r--  src/runtime/cpu/operators/CpuConcatenate.cpp | 168
-rw-r--r--  src/runtime/cpu/operators/CpuConcatenate.h | 76
-rw-r--r--  src/runtime/cpu/operators/CpuConv2d.cpp | 253
-rw-r--r--  src/runtime/cpu/operators/CpuConv2d.h | 146
-rw-r--r--  src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp | 50
-rw-r--r--  src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h | 57
-rw-r--r--  src/runtime/cpu/operators/CpuCopy.cpp | 44
-rw-r--r--  src/runtime/cpu/operators/CpuCopy.h | 53
-rw-r--r--  src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp | 498
-rw-r--r--  src/runtime/cpu/operators/CpuDepthwiseConv2d.h | 209
-rw-r--r--  src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp | 135
-rw-r--r--  src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h | 80
-rw-r--r--  src/runtime/cpu/operators/CpuDequantize.cpp | 54
-rw-r--r--  src/runtime/cpu/operators/CpuDequantize.h | 56
-rw-r--r--  src/runtime/cpu/operators/CpuDirectConv2d.cpp | 147
-rw-r--r--  src/runtime/cpu/operators/CpuDirectConv2d.h | 105
-rw-r--r--  src/runtime/cpu/operators/CpuElementwise.cpp | 124
-rw-r--r--  src/runtime/cpu/operators/CpuElementwise.h | 185
-rw-r--r--  src/runtime/cpu/operators/CpuElementwiseUnary.cpp | 58
-rw-r--r--  src/runtime/cpu/operators/CpuElementwiseUnary.h | 59
-rw-r--r--  src/runtime/cpu/operators/CpuFill.cpp | 39
-rw-r--r--  src/runtime/cpu/operators/CpuFill.h | 46
-rw-r--r--  src/runtime/cpu/operators/CpuFlatten.cpp | 44
-rw-r--r--  src/runtime/cpu/operators/CpuFlatten.h | 64
-rw-r--r--  src/runtime/cpu/operators/CpuFloor.cpp | 44
-rw-r--r--  src/runtime/cpu/operators/CpuFloor.h | 53
-rw-r--r--  src/runtime/cpu/operators/CpuFullyConnected.cpp | 496
-rw-r--r--  src/runtime/cpu/operators/CpuFullyConnected.h | 147
-rw-r--r--  src/runtime/cpu/operators/CpuGemm.cpp | 367
-rw-r--r--  src/runtime/cpu/operators/CpuGemm.h | 145
-rw-r--r--  src/runtime/cpu/operators/CpuGemmConv2d.cpp | 612
-rw-r--r--  src/runtime/cpu/operators/CpuGemmConv2d.h | 203
-rw-r--r--  src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp | 222
-rw-r--r--  src/runtime/cpu/operators/CpuGemmDirectConv2d.h | 106
-rw-r--r--  src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp | 711
-rw-r--r--  src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h | 174
-rw-r--r--  src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp | 147
-rw-r--r--  src/runtime/cpu/operators/CpuGemmLowpOutputStage.h | 86
-rw-r--r--  src/runtime/cpu/operators/CpuMul.cpp | 77
-rw-r--r--  src/runtime/cpu/operators/CpuMul.h | 105
-rw-r--r--  src/runtime/cpu/operators/CpuPRelu.h | 38
-rw-r--r--  src/runtime/cpu/operators/CpuPermute.cpp | 44
-rw-r--r--  src/runtime/cpu/operators/CpuPermute.h | 56
-rw-r--r--  src/runtime/cpu/operators/CpuPool2d.cpp | 158
-rw-r--r--  src/runtime/cpu/operators/CpuPool2d.h | 85
-rw-r--r--  src/runtime/cpu/operators/CpuQuantize.cpp | 58
-rw-r--r--  src/runtime/cpu/operators/CpuQuantize.h | 56
-rw-r--r--  src/runtime/cpu/operators/CpuReshape.cpp | 44
-rw-r--r--  src/runtime/cpu/operators/CpuReshape.h | 53
-rw-r--r--  src/runtime/cpu/operators/CpuScale.cpp | 250
-rw-r--r--  src/runtime/cpu/operators/CpuScale.h | 69
-rw-r--r--  src/runtime/cpu/operators/CpuSoftmax.cpp | 221
-rw-r--r--  src/runtime/cpu/operators/CpuSoftmax.h | 111
-rw-r--r--  src/runtime/cpu/operators/CpuSub.cpp | 46
-rw-r--r--  src/runtime/cpu/operators/CpuSub.h | 66
-rw-r--r--  src/runtime/cpu/operators/CpuTranspose.cpp | 44
-rw-r--r--  src/runtime/cpu/operators/CpuTranspose.h | 53
-rw-r--r--  src/runtime/cpu/operators/CpuWinogradConv2d.cpp | 839
-rw-r--r--  src/runtime/cpu/operators/CpuWinogradConv2d.h | 136
-rw-r--r--  src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 721
-rw-r--r--  src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h | 123
-rw-r--r--  src/runtime/cpu/utils/CpuAuxTensorHandler.h | 111
-rw-r--r--  src/runtime/gpu/cl/IClOperator.h | 37
-rw-r--r--  src/runtime/gpu/cl/operators/ClActivation.cpp | 80
-rw-r--r--  src/runtime/gpu/cl/operators/ClActivation.h | 56
-rw-r--r--  src/runtime/gpu/cl/operators/ClAdd.cpp | 47
-rw-r--r--  src/runtime/gpu/cl/operators/ClAdd.h | 80
-rw-r--r--  src/runtime/gpu/cl/operators/ClCast.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClCast.h | 72
-rw-r--r--  src/runtime/gpu/cl/operators/ClConcatenate.cpp | 247
-rw-r--r--  src/runtime/gpu/cl/operators/ClConcatenate.h | 79
-rw-r--r--  src/runtime/gpu/cl/operators/ClConv2d.cpp | 292
-rw-r--r--  src/runtime/gpu/cl/operators/ClConv2d.h | 152
-rw-r--r--  src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h | 57
-rw-r--r--  src/runtime/gpu/cl/operators/ClCopy.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClCopy.h | 58
-rw-r--r--  src/runtime/gpu/cl/operators/ClCrop.cpp | 46
-rw-r--r--  src/runtime/gpu/cl/operators/ClCrop.h | 65
-rw-r--r--  src/runtime/gpu/cl/operators/ClDequantize.cpp | 53
-rw-r--r--  src/runtime/gpu/cl/operators/ClDequantize.h | 58
-rw-r--r--  src/runtime/gpu/cl/operators/ClDirectConv2d.cpp | 106
-rw-r--r--  src/runtime/gpu/cl/operators/ClDirectConv2d.h | 82
-rw-r--r--  src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp | 92
-rw-r--r--  src/runtime/gpu/cl/operators/ClElementwiseOperations.h | 165
-rw-r--r--  src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp | 116
-rw-r--r--  src/runtime/gpu/cl/operators/ClElementwiseUnary.h | 175
-rw-r--r--  src/runtime/gpu/cl/operators/ClFill.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClFill.h | 57
-rw-r--r--  src/runtime/gpu/cl/operators/ClFlatten.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClFlatten.h | 66
-rw-r--r--  src/runtime/gpu/cl/operators/ClFloor.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClFloor.h | 55
-rw-r--r--  src/runtime/gpu/cl/operators/ClFullyConnected.cpp | 496
-rw-r--r--  src/runtime/gpu/cl/operators/ClFullyConnected.h | 138
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemm.cpp | 771
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemm.h | 137
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemmConv2d.cpp | 628
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemmConv2d.h | 185
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp | 786
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h | 155
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.cpp | 98
-rw-r--r--  src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.h | 88
-rw-r--r--  src/runtime/gpu/cl/operators/ClLogicalNot.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClLogicalNot.h | 55
-rw-r--r--  src/runtime/gpu/cl/operators/ClMul.cpp | 60
-rw-r--r--  src/runtime/gpu/cl/operators/ClMul.h | 103
-rw-r--r--  src/runtime/gpu/cl/operators/ClPRelu.cpp | 57
-rw-r--r--  src/runtime/gpu/cl/operators/ClPRelu.h | 64
-rw-r--r--  src/runtime/gpu/cl/operators/ClPermute.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClPermute.h | 58
-rw-r--r--  src/runtime/gpu/cl/operators/ClPool2d.cpp | 101
-rw-r--r--  src/runtime/gpu/cl/operators/ClPool2d.h | 72
-rw-r--r--  src/runtime/gpu/cl/operators/ClQuantize.cpp | 53
-rw-r--r--  src/runtime/gpu/cl/operators/ClQuantize.h | 60
-rw-r--r--  src/runtime/gpu/cl/operators/ClReshape.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClReshape.h | 55
-rw-r--r--  src/runtime/gpu/cl/operators/ClScale.cpp | 60
-rw-r--r--  src/runtime/gpu/cl/operators/ClScale.h | 66
-rw-r--r--  src/runtime/gpu/cl/operators/ClSoftmax.cpp | 186
-rw-r--r--  src/runtime/gpu/cl/operators/ClSoftmax.h | 95
-rw-r--r--  src/runtime/gpu/cl/operators/ClSub.cpp | 47
-rw-r--r--  src/runtime/gpu/cl/operators/ClSub.h | 80
-rw-r--r--  src/runtime/gpu/cl/operators/ClTranspose.cpp | 45
-rw-r--r--  src/runtime/gpu/cl/operators/ClTranspose.h | 55
-rw-r--r--  src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp | 306
-rw-r--r--  src/runtime/gpu/cl/operators/ClWinogradConv2d.h | 126
-rw-r--r--  src/runtime/gpu/cl/utils/ClAuxTensorHandler.h | 111
208 files changed, 89 insertions, 18502 deletions
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 9c71b2aa7d..bf69868663 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClActivation.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index 53256ebed4..f9403afcb8 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClCast.h"
+#include "src/gpu/cl/operators/ClCast.h"
#include <utility>
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index ea96e45bf8..8ab50beacd 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClConcatenate.h"
+#include "src/gpu/cl/operators/ClConcatenate.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index 8189eee402..7780c0a444 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index b295a274bd..1f715d246d 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -30,7 +30,7 @@
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClConv2d.h"
+#include "src/gpu/cl/operators/ClConv2d.h"
#include "support/Cast.h"
namespace arm_compute
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 98916bf38a..e8aaf85876 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClCopy.h"
+#include "src/gpu/cl/operators/ClCopy.h"
#include <utility>
diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp
index 20cab4df5f..ff30837506 100644
--- a/src/runtime/CL/functions/CLCrop.cpp
+++ b/src/runtime/CL/functions/CLCrop.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClCrop.h"
+#include "src/gpu/cl/operators/ClCrop.h"
#include <utility>
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 6aa370b23c..5930ff19f8 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClCast.h"
+#include "src/gpu/cl/operators/ClCast.h"
#include <utility>
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 3b104017e7..e11802ed1b 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClDequantize.h"
+#include "src/gpu/cl/operators/ClDequantize.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 907e69d8d7..7bbb7e8e4a 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -28,8 +28,8 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/runtime/gpu/cl/operators/ClActivation.h"
-#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
+#include "src/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 60c699cbb8..936b37fb31 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -28,9 +28,9 @@
#include "arm_compute/core/Types.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClAdd.h"
-#include "src/runtime/gpu/cl/operators/ClElementwiseOperations.h"
-#include "src/runtime/gpu/cl/operators/ClSub.h"
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClElementwiseOperations.h"
+#include "src/gpu/cl/operators/ClSub.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
index a45dd6f9a6..9dcd2d1891 100644
--- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClElementwiseUnary.h"
+#include "src/gpu/cl/operators/ClElementwiseUnary.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index b22d79fea4..6019a84aba 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClFill.h"
+#include "src/gpu/cl/operators/ClFill.h"
#include <utility>
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 9563055276..32fc37552c 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -30,7 +30,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/gpu/cl/operators/ClFlatten.h"
+#include "src/gpu/cl/operators/ClFlatten.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 4c5e482b10..8739e1803e 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClFloor.h"
+#include "src/gpu/cl/operators/ClFloor.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 4f9759c590..02b2042a6c 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClFullyConnected.h"
+#include "src/gpu/cl/operators/ClFullyConnected.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 14b0633e09..cc6689c504 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -32,7 +32,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemm.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 563dbd414f..837527bac3 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -32,7 +32,7 @@
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClGemmConv2d.h"
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
#include "support/Cast.h"
#include <cmath>
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 1ae2dfbad6..d9029478a1 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -37,7 +37,7 @@
#include "arm_compute/runtime/IMemoryManager.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 94d4c33fa2..6feed0d713 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -32,7 +32,7 @@
#include "arm_compute/core/Types.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.h"
+#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h"
#include <algorithm>
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 9754bdcb82..0122162073 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -30,7 +30,7 @@
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/kernels/ClTransposeKernel.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp
index 98c98abed5..306957a8d1 100644
--- a/src/runtime/CL/functions/CLLogicalAnd.cpp
+++ b/src/runtime/CL/functions/CLLogicalAnd.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include <utility>
diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp
index 388d2bce86..a0504d7852 100644
--- a/src/runtime/CL/functions/CLLogicalNot.cpp
+++ b/src/runtime/CL/functions/CLLogicalNot.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClLogicalNot.h"
+#include "src/gpu/cl/operators/ClLogicalNot.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp
index 897963ab50..63524213f7 100644
--- a/src/runtime/CL/functions/CLLogicalOr.cpp
+++ b/src/runtime/CL/functions/CLLogicalOr.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include <utility>
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index bb7aff218d..186e7b4ba2 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -24,8 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPRelu.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/operators/ClPRelu.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index c1da2a9eca..556e943152 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPermute.h"
+#include "src/gpu/cl/operators/ClPermute.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 932659268d..9d91e58367 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClMul.h"
+#include "src/gpu/cl/operators/ClMul.h"
#include <utility>
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 7ba911c342..0ebce318fa 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClPool2d.h"
+#include "src/gpu/cl/operators/ClPool2d.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 5df895a91c..6ddf555b5c 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -32,8 +32,8 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index e6451b2eb4..b249bdd1db 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClQuantize.h"
+#include "src/gpu/cl/operators/ClQuantize.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index 060eddb96c..c51a3298c1 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClReshape.h"
+#include "src/gpu/cl/operators/ClReshape.h"
/** [CLReshapeLayer snippet] **/
namespace arm_compute
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index cbd93c1086..5b78989bfa 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClScale.h"
+#include "src/gpu/cl/operators/ClScale.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index de58bf1b02..d52352fc8d 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -28,10 +28,10 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClPermute.h"
-#include "src/runtime/gpu/cl/operators/ClSoftmax.h"
+#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+#include "src/gpu/cl/operators/ClPermute.h"
+#include "src/gpu/cl/operators/ClSoftmax.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index 142cf73259..e63c92eeb4 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CL/ICLKernel.h"
-#include "src/runtime/gpu/cl/operators/ClTranspose.h"
+#include "src/gpu/cl/operators/ClTranspose.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index fa01c914c5..b416d0fcf1 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClWinogradConv2d.h"
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
#include "support/Cast.h"
namespace arm_compute
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
index 390bb97665..67253c7277 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
#include <utility>
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
index b799de6967..a64de9952e 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/GPUTarget.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
#include <utility>
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
index 982748810d..b3403b2aaf 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
#include <utility>
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
index b8437487f8..b06c3b0f8e 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
@@ -27,11 +27,11 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h"
#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"
#include "src/runtime/CL/mlgo/Utils.h"
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 2b5c51fa5a..e48aede590 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuActivation.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 2e4755b949..a7581ca9f4 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuAdd.h"
+#include "src/cpu/operators/CpuAdd.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 0263d4cbb6..6fdd4267bf 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuSub.h"
+#include "src/cpu/operators/CpuSub.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp
index b519576ad5..a39e639ad4 100644
--- a/src/runtime/NEON/functions/NECast.cpp
+++ b/src/runtime/NEON/functions/NECast.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCast.h"
+#include "src/cpu/operators/CpuCast.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index dcc5cd3a64..ceb697aad6 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
-#include "src/runtime/cpu/operators/CpuConcatenate.h"
+#include "src/cpu/operators/CpuConcatenate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index 1f6b3c94e2..535ac99001 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 8bd1119a69..ca62a40cc8 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -28,11 +28,11 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuConv2d.h"
-#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
-#include "src/runtime/cpu/operators/CpuGemmConv2d.h"
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
-#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
+#include "src/cpu/operators/CpuConv2d.h"
+#include "src/cpu/operators/CpuDirectConv2d.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index 20642b5eed..c2059e8e98 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCopy.h"
+#include "src/cpu/operators/CpuCopy.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 07e985c25e..1ec32074a5 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCast.h"
+#include "src/cpu/operators/CpuCast.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index daa5fd5ab9..ed6dec3850 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
+#include "src/cpu/operators/CpuDepthwiseConv2d.h"
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 91e37594af..83e0131c83 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuDequantize.h"
+#include "src/cpu/operators/CpuDequantize.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 58530e4a8f..ef3d3d6055 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
+#include "src/cpu/operators/CpuDirectConv2d.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
index 946bbb24b8..c958adf97c 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuElementwise.h"
+#include "src/cpu/operators/CpuElementwise.h"
#include "arm_compute/core/ITensor.h"
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index 1a9e8839ca..a0674ec320 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
-#include "src/runtime/cpu/operators/CpuElementwiseUnary.h"
+#include "src/cpu/operators/CpuElementwiseUnary.h"
#include <utility>
namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index ee539fdfc8..43667783bf 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEFill.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuFill.h"
+#include "src/cpu/operators/CpuFill.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 4d1054ad25..f435842634 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/cpu/operators/CpuFlatten.h"
+#include "src/cpu/operators/CpuFlatten.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index f8a3c13d6d..d2dc48a159 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuFloor.h"
+#include "src/cpu/operators/CpuFloor.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index cb7e2dc7ec..3f55a1f34e 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuFullyConnected.h"
+#include "src/cpu/operators/CpuFullyConnected.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index b470afe1c6..58ade9fb3a 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -30,7 +30,7 @@
#include "arm_compute/runtime/Tensor.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuGemm.h"
using namespace arm_compute::experimental;
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 2230e80e4b..42b8b70405 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 47ab16816a..c780d63763 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
using namespace arm_compute::experimental;
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index b85530c70f..6c179f8387 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -31,7 +31,7 @@
#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
using namespace arm_compute::experimental;
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 8351cc66d0..7e1de3c257 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -25,7 +25,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuGemmLowpOutputStage.h"
+#include "src/cpu/operators/CpuGemmLowpOutputStage.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index a05b545e9a..80c5690a4e 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuPRelu.h"
+#include "src/cpu/operators/CpuPRelu.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index f707fad757..517b86a1cb 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
+#include "src/cpu/operators/CpuPermute.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 3a2f1984b4..ad83a26beb 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuMul.h"
+#include "src/cpu/operators/CpuMul.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 8d267a32c0..5a3b9c5e7e 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -27,7 +27,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuPool2d.h"
+#include "src/cpu/operators/CpuPool2d.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 946791a104..565c5531c4 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -32,8 +32,8 @@
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index e607917615..dad246ac89 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuQuantize.h"
+#include "src/cpu/operators/CpuQuantize.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index c0c78ea652..3ccb42361e 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuReshape.h"
+#include "src/cpu/operators/CpuReshape.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 0fbad07d0f..b952858181 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -26,7 +26,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/utils/ScaleUtils.h"
-#include "src/runtime/cpu/operators/CpuScale.h"
+#include "src/cpu/operators/CpuScale.h"
#include "support/Rounding.h"
namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index bee692c08b..0947ff94a6 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/runtime/cpu/operators/CpuSoftmax.h"
+#include "src/cpu/kernels/CpuSoftmaxKernel.h"
+#include "src/cpu/operators/CpuSoftmax.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 3b3023f3b3..b6bf15e428 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuTranspose.h"
+#include "src/cpu/operators/CpuTranspose.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 98ff12590b..f0c153d4f4 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -29,9 +29,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
#include "src/core/NEON/kernels/convolution/common/utils.hpp"
#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
diff --git a/src/runtime/cpu/ICpuOperator.h b/src/runtime/cpu/ICpuOperator.h
deleted file mode 100644
index 70ab4364c7..0000000000
--- a/src/runtime/cpu/ICpuOperator.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICPUOPERATOR_H
-#define ARM_COMPUTE_ICPUOPERATOR_H
-
-#include "arm_compute/runtime/NEON/INEOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using ICpuOperator = experimental::INEOperator;
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ICPUOPERATOR_H */
diff --git a/src/runtime/cpu/operators/CpuActivation.cpp b/src/runtime/cpu/operators/CpuActivation.cpp
deleted file mode 100644
index 0b43b322ad..0000000000
--- a/src/runtime/cpu/operators/CpuActivation.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuActivation.h"
-
-#include "src/common/IOperator.h"
-#include "src/common/utils/LegacySupport.h"
-#include "src/core/cpu/kernels/CpuActivationKernel.h"
-#include "src/cpu/CpuContext.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
-{
- auto k = std::make_unique<kernels::CpuActivationKernel>();
- k->configure(input, output, activation_info);
- _kernel = std::move(k);
-}
-
-Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
-{
- return kernels::CpuActivationKernel::validate(input, output, activation_info);
-}
-
-std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
-{
- TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
- TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
- auto info = detail::convert_to_activation_info(act);
-
- if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
- {
- return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
- }
-
- auto act_op = std::make_unique<cpu::CpuActivation>();
- act_op->configure(&src_info, &dst_info, info);
-
- auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
- if(op == nullptr)
- {
- ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
- return std::make_tuple(nullptr, StatusCode::OutOfMemory);
- }
- op->set_internal_operator(std::move(act_op));
-
- return std::make_tuple(op, StatusCode::Success);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuActivation.h b/src/runtime/cpu/operators/CpuActivation.h
deleted file mode 100644
index ded4a37edb..0000000000
--- a/src/runtime/cpu/operators/CpuActivation.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ACTIVATION_H
-#define ARM_COMPUTE_CPU_ACTIVATION_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuActivationKernel */
-class CpuActivation : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] output Destination tensor info. Data type supported: same as @p src
- * @param[in] activation_info Activation layer parameters.
- */
- void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuActivation::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */
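
For context, a minimal usage sketch of the operator interface deleted above: configure() and validate() see only tensor metadata, and the actual tensors are bound at run time through an ITensorPack, matching the pattern used throughout these files. The include path is the pre-move one shown in this diff, the function name and shapes are illustrative, and the exact set of headers is approximate.

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/runtime/cpu/operators/CpuActivation.h"

    using namespace arm_compute;

    void run_relu_sketch()
    {
        // configure()/validate() operate on metadata only
        TensorInfo src_info(TensorShape(32U, 32U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(32U, 32U), 1, DataType::F32);
        ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

        // Status converts to bool; false signals an unsupported configuration
        if(!bool(cpu::CpuActivation::validate(&src_info, &dst_info, act)))
        {
            return;
        }

        cpu::CpuActivation op;
        op.configure(&src_info, &dst_info, act);

        // Real tensors are only bound at run time, through an ITensorPack
        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        op.run(pack);
    }
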
diff --git a/src/runtime/cpu/operators/CpuAdd.cpp b/src/runtime/cpu/operators/CpuAdd.cpp
deleted file mode 100644
index 23b09aca4f..0000000000
--- a/src/runtime/cpu/operators/CpuAdd.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuAdd.h"
-
-#include "src/core/cpu/kernels/CpuAddKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<kernels::CpuAddKernel>();
- k->configure(src0, src1, dst, policy);
- _kernel = std::move(k);
-}
-
-Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuAdd.h b/src/runtime/cpu/operators/CpuAdd.h
deleted file mode 100644
index 3ff135fe41..0000000000
--- a/src/runtime/cpu/operators/CpuAdd.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ADD_H
-#define ARM_COMPUTE_CPU_ADD_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuAddKernel */
-class CpuAdd : public ICpuOperator
-{
-public:
- /** Initialise the kernel's input, dst and border mode.
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] policy Overflow policy.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- *
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuAdd::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ADD_H */
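
A short sketch of how the (src0,src1) -> dst combinations documented above map onto validate()/configure(). Shapes and quantization values are illustrative, the function name is hypothetical, and the include path is the pre-move one from this diff.

    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuAdd.h"

    using namespace arm_compute;

    bool saturating_qasymm8_add_sketch()
    {
        // (QASYMM8, QASYMM8) -> QASYMM8 is one of the valid rows listed above
        TensorInfo a(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
        TensorInfo b(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3));
        TensorInfo out(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f, 0));

        // Fused activation is not supported, so act_info is left at its default
        if(!bool(cpu::CpuAdd::validate(&a, &b, &out, ConvertPolicy::SATURATE)))
        {
            return false;
        }

        cpu::CpuAdd add;
        add.configure(&a, &b, &out, ConvertPolicy::SATURATE);
        // Execution then binds ACL_SRC_0, ACL_SRC_1 and ACL_DST in an ITensorPack
        return true;
    }
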
diff --git a/src/runtime/cpu/operators/CpuCast.cpp b/src/runtime/cpu/operators/CpuCast.cpp
deleted file mode 100644
index 5a4f6c518e..0000000000
--- a/src/runtime/cpu/operators/CpuCast.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuCast.h"
-
-#include "src/core/cpu/kernels/CpuCastKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
- auto k = std::make_unique<kernels::CpuCastKernel>();
- k->configure(src, dst, policy);
- _kernel = std::move(k);
-}
-
-Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- return kernels::CpuCastKernel::validate(src, dst, policy);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCast.h b/src/runtime/cpu/operators/CpuCast.h
deleted file mode 100644
index 26f5740b86..0000000000
--- a/src/runtime/cpu/operators/CpuCast.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CAST_H
-#define ARM_COMPUTE_CPU_CAST_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuCastKernel */
-class CpuCast : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * Input data type must be different than output data type.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:-----------------------------------------------|
- * |QASYMM8_SIGNED | S16, S32, F32, F16 |
- * |QASYMM8 | U16, S16, S32, F32, F16 |
- * |U8 | U16, S16, S32, F32, F16 |
- * |U16 | U8, U32 |
- * |S16 | QASYMM8_SIGNED, U8, S32 |
- * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 |
- * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 |
- * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8|
- *
- * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuCast::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CAST_H */
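
A brief sketch of one conversion picked from the table above (QASYMM8 -> F32), under the same assumptions as the earlier sketches (pre-move include path, illustrative shapes and quantization, hypothetical function name).

    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuCast.h"

    using namespace arm_compute;

    void dequantizing_cast_sketch()
    {
        TensorInfo src(TensorShape(8U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128));
        TensorInfo dst(TensorShape(8U, 8U), 1, DataType::F32);

        if(bool(cpu::CpuCast::validate(&src, &dst, ConvertPolicy::SATURATE)))
        {
            cpu::CpuCast cast;
            cast.configure(&src, &dst, ConvertPolicy::SATURATE);
        }
    }
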
diff --git a/src/runtime/cpu/operators/CpuConcatenate.cpp b/src/runtime/cpu/operators/CpuConcatenate.cpp
deleted file mode 100644
index bb475b790e..0000000000
--- a/src/runtime/cpu/operators/CpuConcatenate.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuConcatenate.h"
-
-#include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateDepthKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis)
-{
- ARM_COMPUTE_ERROR_ON(dst == nullptr);
-
- _axis = axis;
- _num_srcs = srcs_vector.size();
-
- TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
-
-    // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, dst_shape, 1, srcs_vector[0]->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(CpuConcatenate::validate(srcs_vector, dst, axis));
-
- unsigned int offset = 0;
-
- for(unsigned int i = 0; i < _num_srcs; ++i)
- {
- switch(axis)
- {
- case Window::DimX:
- {
- auto kernel = std::make_unique<kernels::CpuConcatenateWidthKernel>();
- kernel->configure(srcs_vector.at(i), offset, dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- case Window::DimY:
- {
- auto kernel = std::make_unique<kernels::CpuConcatenateHeightKernel>();
- kernel->configure(srcs_vector.at(i), offset, dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- case Window::DimZ:
- {
- auto kernel = std::make_unique<kernels::CpuConcatenateDepthKernel>();
- kernel->configure(srcs_vector.at(i), offset, dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- case 3:
- {
- auto kernel = std::make_unique<kernels::CpuConcatenateBatchKernel>();
- kernel->configure(srcs_vector.at(i), offset, dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
- offset += srcs_vector.at(i)->dimension(axis);
- }
-}
-
-Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
- ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
-
- unsigned int offset = 0;
- for(const auto &src : srcs_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- switch(axis)
- {
- case Window::DimX:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateWidthKernel::validate(src, offset, dst));
- break;
- }
- case Window::DimY:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateHeightKernel::validate(src, offset, dst));
- break;
- }
- case Window::DimZ:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateDepthKernel::validate(src, offset, dst));
- break;
- }
- case 3:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateBatchKernel::validate(src, offset, dst));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
- offset += src->dimension(axis);
- }
-
- if(dst->total_size() != 0)
- {
- TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
- ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
- }
-
- return Status{};
-}
-
-void CpuConcatenate::run(ITensorPack &tensors)
-{
- if(tensors.empty())
- {
- ARM_COMPUTE_ERROR("No inputs provided");
- }
-
- if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
- {
- ARM_COMPUTE_ERROR("Configured with different number of inputs");
- }
-
- int i = 0;
- for(auto &k : _concat_kernels)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
- pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
- NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack);
- ++i;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuConcatenate.h b/src/runtime/cpu/operators/CpuConcatenate.h
deleted file mode 100644
index 55eab54996..0000000000
--- a/src/runtime/cpu/operators/CpuConcatenate.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONCATENATE_H
-#define ARM_COMPUTE_CPU_CONCATENATE_H
-
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
- *
- * -# @ref kernels::CpuConcatenateWidthKernel (if underlying concatenation axis is 0).
- * -# @ref kernels::CpuConcatenateHeightKernel (if underlying concatenation axis is 1).
- * -# @ref kernels::CpuConcatenateDepthKernel (if underlying concatenation axis is 2).
- * -# @ref kernels::CpuConcatenateBatchKernel (if underlying concatenation axis is 3).
- */
-class CpuConcatenate : public ICpuOperator
-{
-public:
- CpuConcatenate() = default;
- /** Configure operator for a given list of arguments
- *
-     * @note Input and output tensor dimension preconditions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel,
- * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel.
- *
- * @param[in,out] srcs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Output tensor. Data types supported: Same as @p srcs_vector.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
- */
- void configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuConcatenate::configure()
- *
- * @return a status
- */
- static Status validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- std::vector<std::unique_ptr<ICpuKernel>> _concat_kernels{};
- unsigned int _num_srcs{ 0 };
- unsigned int _axis{ 0 };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATE_H */
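
A sketch of a width-axis (axis 0) concatenation under the same assumptions as above; the run-time binding mirrors how run() in the deleted source reads its inputs as ACL_SRC_VEC + index.

    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuConcatenate.h"

    using namespace arm_compute;

    void width_concat_sketch()
    {
        TensorInfo in0(TensorShape(4U, 8U), 1, DataType::F32);
        TensorInfo in1(TensorShape(6U, 8U), 1, DataType::F32);
        TensorInfo out; // configure() auto-initializes the destination shape (10, 8)

        cpu::CpuConcatenate concat;
        concat.configure({ &in0, &in1 }, &out, 0);

        // At run time each source is bound at ACL_SRC_VEC + its index:
        //   ITensorPack pack;
        //   pack.add_const_tensor(TensorType::ACL_SRC_VEC + 0, &tensor0);
        //   pack.add_const_tensor(TensorType::ACL_SRC_VEC + 1, &tensor1);
        //   pack.add_tensor(TensorType::ACL_DST, &tensor_out);
        //   concat.run(pack);
    }
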
diff --git a/src/runtime/cpu/operators/CpuConv2d.cpp b/src/runtime/cpu/operators/CpuConv2d.cpp
deleted file mode 100644
index cff9238308..0000000000
--- a/src/runtime/cpu/operators/CpuConv2d.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuConv2d.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
-#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
-#include "src/runtime/cpu/operators/CpuGemm.h"
-#include "src/runtime/cpu/operators/CpuGemmConv2d.h"
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
-#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-CpuConv2d::CpuConv2d()
- : _function()
-{
-}
-
-CpuConv2d::~CpuConv2d() = default;
-
-void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_UNUSED(num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
-
- const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
- {
- case ConvolutionMethod::WINOGRAD:
- {
- auto f = std::make_unique<CpuWinogradConv2d>();
- f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
- _function = std::move(f);
- break;
- }
- case ConvolutionMethod::GEMM:
- {
- auto f = std::make_unique<CpuGemmConv2d>();
- f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
- _function = std::move(f);
- break;
- }
- case ConvolutionMethod::GEMM_CONV2D:
- {
- auto f = std::make_unique<CpuGemmDirectConv2d>();
- f->configure(input, weights, biases, output, info);
- _function = std::move(f);
- break;
- }
- case ConvolutionMethod::DIRECT:
- {
- auto f = std::make_unique<CpuDirectConv2d>();
- f->configure(input, weights, biases, output, conv_info, act_info);
- _function = std::move(f);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
-
- _aux_mem = _function->workspace();
-}
-
-Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
-
- const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
- {
- case ConvolutionMethod::WINOGRAD:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
- break;
- case ConvolutionMethod::GEMM:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math));
- break;
- case ConvolutionMethod::GEMM_CONV2D:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info));
- break;
- case ConvolutionMethod::DIRECT:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuDirectConv2d::validate(input, weights, biases, output, conv_info, act_info));
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
-
- return Status{};
-}
-
-ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
- ARM_COMPUTE_UNUSED(weights_info);
-
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
-
- /* Input spatial dims, kernel size, IFM/OFM, conv info*/
- using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
- using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
-
- const std::vector<ConfigurationMethod> known_configs =
- {
- // Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
- // VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
- };
-
- const auto find_config = [&](ConfigurationMethod c)
- {
- const ConvolutionConfiguration config = c.first;
- const PadStrideInfo info = std::get<3>(config);
-
- return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
- };
-
- std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
- {
- return (*found).second;
- }
-
- if(dilation != Size2D(1U, 1U))
- {
- return ConvolutionMethod::GEMM;
- }
- else
- {
- // SRGAN
- // Output might not be initialized when it is an internal tensor of the layer using the convolution
- if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
- && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::DIRECT;
- }
- if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::FFT;
- }
- if(input->dimension(idx_c) < 16)
- {
- return ConvolutionMethod::GEMM;
- }
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- // This heuristics only applies to F16 data type on A55r1
- if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
- {
- // Exclude known bad winograd configs (and defaults to GEMM)
- const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
- {
- // Squeezenet_V1_1 fire2 and fire3
- ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
- // Squeezenet_V1_1 fire6 and fire7
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
- // Squeezenet_V1_1 fire8 and fire9
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
- };
- const auto find_conv_config = [&](ConvolutionConfiguration c)
- {
- const PadStrideInfo info = std::get<3>(c);
-
- return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
- };
-
- bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
- find_conv_config)
- != known_bad_winograd_f16_with_fastmath_configs.end();
- if(found_bad)
- {
- return ConvolutionMethod::GEMM;
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- // For 1x1 convolutions run the default GEMM
- if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
- {
- return ConvolutionMethod::GEMM;
- }
-
- if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
- {
- return ConvolutionMethod::WINOGRAD;
- }
- if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
- {
- return ConvolutionMethod::GEMM_CONV2D;
- }
- return ConvolutionMethod::GEMM;
- }
-}
-
-void CpuConv2d::run(ITensorPack &tensors)
-{
- prepare(tensors);
- _function->run(tensors);
-}
-
-void CpuConv2d::prepare(ITensorPack &tensors)
-{
- _function->prepare(tensors);
-}
-
-experimental::MemoryRequirements CpuConv2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuConv2d.h b/src/runtime/cpu/operators/CpuConv2d.h
deleted file mode 100644
index d7b42deea1..0000000000
--- a/src/runtime/cpu/operators/CpuConv2d.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to simulate a convolution layer. This function calls one of the following functions:
- * -# @ref CpuGemm (executed only in case GEMM is required for the operation)
- * -# @ref CpuWinogradConv2d (executed only in case Winograd is required for the operation)
- * -# @ref CpuDirectConv2d (executed only in case Direct Convolution is required for the operation)
- *
- *
- * The function selects one of the algorithms mentioned above based on:
- * - The size of the kernel
- * - Number of input/output feature maps
- * - Amount of memory needed
- *
- * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed.
- *
- * FP32 Algorithm| Filter Size | Input/Output feature maps |
- * --------------|----------------------------------------------------|-------------------------------------------|
- * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 |
- * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps |
- * DirectConv | 9x9 | |
- * GEMM | Any size | |
- *
- * Winograd 5x5 requires fast maths enabled.
- *
- * FP16 Algorithm| Filter Size |
- * --------------|------------------|
- * Winograd | Not supported |
- * FFT | Not supported |
- * DirectConv | 9x9 |
- * GEMM | Any size |
- *
- *
- */
-class CpuConv2d : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuConv2d();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConv2d);
- /** Default destructor */
- ~CpuConv2d();
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:------------------|:------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
- *
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Same as @p src, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
- * available which may introduce a drop of accuracy as well. Default is false
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
- */
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d
- *
- * Similar to CpuConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
- /** Static function to check if given info will return the convolution called by @ref CpuConv2d
- *
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported:Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
- * available which may introduce a drop of accuracy as well. Default is false
- *
- * @return the Convolution Method Hint
- */
- static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- std::unique_ptr<ICpuOperator> _function;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace cpu
-} // namespace arm_compute
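
A sketch of how the selection heuristic documented above is typically exercised: query get_convolution_method() with the same arguments that would later be passed to configure(). Layout, shapes and strides are illustrative, the function name is hypothetical, and the include path is the pre-move one from this diff.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/runtime/cpu/operators/CpuConv2d.h"

    using namespace arm_compute;

    void conv2d_method_sketch()
    {
        // NHWC shapes are ordered [C, W, H, N]; weights are [IFM, Kx, Ky, OFM]
        TensorInfo src(TensorShape(16U, 56U, 56U, 1U), 1, DataType::F32);
        TensorInfo weights(TensorShape(16U, 3U, 3U, 32U), 1, DataType::F32);
        TensorInfo dst(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        weights.set_data_layout(DataLayout::NHWC);
        dst.set_data_layout(DataLayout::NHWC);

        const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1x1, pad 1x1

        const ConvolutionMethod method = cpu::CpuConv2d::get_convolution_method(&src, &weights, &dst, conv_info);
        (void)method;

        cpu::CpuConv2d conv;
        conv.configure(&src, &weights, nullptr, &dst, conv_info);
        // conv.workspace() reports the auxiliary buffers that, together with
        // ACL_SRC_0/ACL_SRC_1/ACL_SRC_2 and ACL_DST, must appear in the ITensorPack
    }
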
diff --git a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp
deleted file mode 100644
index 3f2f4e95cf..0000000000
--- a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
- auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
- k->configure(src, dst, original_src_shape, data_layout);
- _kernel = std::move(k);
-}
-
-Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
- return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
-}
-
-void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
-{
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h
deleted file mode 100644
index 53ee17f6d1..0000000000
--- a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
-#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuConvertFullyConnectedWeightsKernel */
-class CpuConvertFullyConnectedWeights : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor to permute. Data types supported: All
-     * @param[out] dst                Destination tensor. Data types supported: Same as @p src
- * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuConvertFullyConnectedWeights::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H */
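
A sketch of what original_src_shape and data_layout mean in practice: fully connected weights trained against an NCHW feature map need re-ordering before the network can run in NHWC. Shapes are illustrative, the exact row/column orientation of the weights follows the fully connected layer's own convention (and is gated here by validate()), and the include path is the pre-move one from this diff.

    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"

    using namespace arm_compute;

    void convert_fc_weights_sketch()
    {
        // The tensor that fed the fully connected layer was 7x7x64 (W, H, C)
        const TensorShape original_src_shape(7U, 7U, 64U);

        // Flattened 2D FC weights: 7*7*64 inputs by 1000 outputs (orientation illustrative)
        TensorInfo weights(TensorShape(7U * 7U * 64U, 1000U), 1, DataType::F32);
        TensorInfo converted(TensorShape(7U * 7U * 64U, 1000U), 1, DataType::F32);

        // data_layout names the layout the weights were trained in
        if(bool(cpu::CpuConvertFullyConnectedWeights::validate(&weights, &converted, original_src_shape, DataLayout::NCHW)))
        {
            cpu::CpuConvertFullyConnectedWeights convert;
            convert.configure(&weights, &converted, original_src_shape, DataLayout::NCHW);
        }
    }
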
diff --git a/src/runtime/cpu/operators/CpuCopy.cpp b/src/runtime/cpu/operators/CpuCopy.cpp
deleted file mode 100644
index 9fbe916163..0000000000
--- a/src/runtime/cpu/operators/CpuCopy.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuCopy.h"
-
-#include "src/core/cpu/kernels/CpuCopyKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuCopyKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuCopyKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCopy.h b/src/runtime/cpu/operators/CpuCopy.h
deleted file mode 100644
index 861bbb7849..0000000000
--- a/src/runtime/cpu/operators/CpuCopy.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_COPY_H
-#define ARM_COMPUTE_CPU_COPY_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuCopyKernel */
-class CpuCopy : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor info. Data type supported: All
- * @param[out] dst Destination info. Data type supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuCopy::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_COPY_H */
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
deleted file mode 100644
index 8141487125..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/InfoHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- if(!is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
- const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
- info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
- info.pad_stride_info.pad_bottom());
-
- if(biases != nullptr)
- {
- const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
- }
-
- ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
-
- // Validate Activation Layer
- if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
- }
- return Status{};
-}
-} // namespace
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
-
- _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
- _has_bias = biases != nullptr;
- _is_nchw = src->data_layout() == DataLayout::NCHW;
- _permute = _is_nchw;
- _is_prepared = false;
-
- // Configure pipeline
- _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
-
- _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
- if(_is_nchw)
- {
- _permute_input = std::make_unique<cpu::CpuPermute>();
- _permute_weights = std::make_unique<cpu::CpuPermute>();
- _permute_output = std::make_unique<cpu::CpuPermute>();
-
- auto input_perm = std::make_unique<TensorInfo>();
- auto weights_perm = std::make_unique<TensorInfo>();
- auto output_perm = std::make_unique<TensorInfo>();
-
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
- input_perm->set_data_layout(DataLayout::NHWC);
-
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
- weights_perm->set_data_layout(DataLayout::NHWC);
-
- output_perm->set_data_layout(DataLayout::NHWC);
- output_perm->set_quantization_info(dst->quantization_info());
-
- // Configure optimized depthwise
- _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
-
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- output_perm->set_data_layout(DataLayout::NHWC);
- _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
- }
- else
- {
- _dwc_optimized_func->configure(src, weights, biases, dst, info);
- }
-
- // Configure activation
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function = std::make_unique<cpu::CpuActivation>();
- _activationlayer_function->configure(dst, nullptr, info.act_info);
- }
-}
-
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- const ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- return validate_arguments_optimized(src, weights, biases, dst, info);
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- prepare(tensors);
-
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- auto workspace = tensors.get_tensor(TensorType::ACL_INT_3);
- auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
-
- // Permute input
- if(_permute)
- {
- ITensorPack pack;
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
- pack.add_tensor(TensorType::ACL_SRC, src);
- pack.add_tensor(TensorType::ACL_DST, src_perm);
- _permute_input->run(pack);
- }
-
- // Run assembly function
- if(_is_nchw)
- {
- auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
- auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
- auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_0, src_perm);
- pack.add_tensor(TensorType::ACL_SRC_1, weights_perm);
- pack.add_tensor(TensorType::ACL_SRC_2, bias);
- pack.add_tensor(TensorType::ACL_INT_0, workspace);
- pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
- pack.add_tensor(TensorType::ACL_DST, dst_perm);
- _dwc_optimized_func->run(pack);
- }
- else
- {
- auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
- auto weights = tensors.get_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_0, src);
- pack.add_tensor(TensorType::ACL_SRC_1, weights);
- pack.add_tensor(TensorType::ACL_SRC_2, bias);
- pack.add_tensor(TensorType::ACL_INT_0, workspace);
- pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _dwc_optimized_func->run(pack);
- }
-
- // Permute output
- if(_is_nchw)
- {
- ITensorPack pack;
- auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
- pack.add_tensor(TensorType::ACL_SRC, dst_perm);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output->run(pack);
- }
-
- // Run activation
- if(_is_activationlayer_enabled)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, dst);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _activationlayer_function->run(pack);
- }
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
-
- // Permute weights
- if(_permute)
- {
- auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, weights);
- pack.add_tensor(TensorType::ACL_DST, permuted_weights);
- _permute_weights->run(pack);
-
- weights->mark_as_unused();
-
- ITensorPack pack_opt;
- pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights);
- pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
- pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
-
- // Prepare optimized function
- _dwc_optimized_func->prepare(pack_opt);
- }
- else
- {
- ITensorPack pack_opt;
- pack_opt.add_tensor(TensorType::ACL_SRC_1, weights);
- pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
- pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
-
- // Prepare optimized function
- _dwc_optimized_func->prepare(pack_opt);
- }
-
- _is_prepared = true;
- }
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
-
- _is_nchw = src->data_layout() == DataLayout::NCHW;
- _is_prepared = !_is_nchw;
-
- ITensorInfo *input_to_use = src;
- const ITensorInfo *weights_to_use = weights;
- ITensorInfo *output_to_use = dst;
-
- auto input_perm = std::make_unique<TensorInfo>();
- auto weights_perm = std::make_unique<TensorInfo>();
- auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
-
- if(_is_nchw)
- {
- _permute_input = std::make_unique<cpu::CpuPermute>();
- _permute_weights = std::make_unique<cpu::CpuPermute>();
-
- _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
- input_perm->set_data_layout(DataLayout::NHWC);
- input_to_use = input_perm.get();
-
- _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
- weights_perm->set_data_layout(DataLayout::NHWC);
- weights_to_use = weights_perm.get();
-
- output_to_use = output_perm.get();
- }
-
- _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
- _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
-
- if(_is_nchw)
- {
- _permute_output = std::make_unique<cpu::CpuPermute>();
- _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
- output_perm->set_data_layout(DataLayout::NHWC);
- }
-
- //Configure Activation Layer
- _is_activationlayer_enabled = info.act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function = std::make_unique<cpu::CpuActivation>();
- _activationlayer_function->configure(dst, nullptr, info.act_info);
- }
-}
-
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- if(src->data_layout() == DataLayout::NCHW)
- {
- TensorShape permuted_input_shape = src->tensor_shape();
- TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
- permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
- permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
-
- const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
- }
-
- // Validate Activation Layer
- if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
- }
-
- return Status{};
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
-{
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
-
- if(_is_nchw)
- {
- prepare(tensors);
- auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
- auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
- auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, src);
- pack.add_tensor(TensorType::ACL_DST, src_perm);
- _permute_input->run(pack);
-
- ITensorPack pack_depth;
- pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm);
- pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
- pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
- pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
- }
- else
- {
- ITensorPack pack_depth;
- pack_depth.add_tensor(TensorType::ACL_SRC_0, src);
- pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
- pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
- pack_depth.add_tensor(TensorType::ACL_DST, dst);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
- }
-
- if(_is_nchw)
- {
- ITensorPack pack;
- auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
- pack.add_tensor(TensorType::ACL_SRC, dst_perm);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output->run(pack);
- }
-
- if(_is_activationlayer_enabled)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, dst);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _activationlayer_function->run(pack);
- }
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
-
- ARM_COMPUTE_ERROR_ON(!weights->is_used());
-
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, weights);
- pack.add_tensor(TensorType::ACL_DST, weights_perm);
-
- _permute_weights->run(pack);
- weights->mark_as_unused();
- _is_prepared = true;
- }
-}
-
-void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
-{
- _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.configure(src, weights, biases, dst, info);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.configure(src, weights, biases, dst, info);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
- }
-}
-
-Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
- switch(depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
- }
-}
-
-DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
- {
- return DepthwiseConvolutionFunction::OPTIMIZED;
- }
- else
- {
- return DepthwiseConvolutionFunction::GENERIC;
- }
-}
-
-void CpuDepthwiseConv2d::run(ITensorPack &tensors)
-{
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.run(tensors);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.run(tensors);
- break;
- default:
- ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
- }
-}
-
-void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
-{
- switch(_depth_conv_func)
- {
- case DepthwiseConvolutionFunction::OPTIMIZED:
- _func_optimized.prepare(tensors);
- break;
- case DepthwiseConvolutionFunction::GENERIC:
- _func_generic.prepare(tensors);
- break;
- default:
- ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
- }
-}
-} // namespace cpu
-} // namespace arm_compute
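Note on the file above: it follows the stateless operator pattern used across the cpu backend. configure() and validate() work purely on ITensorInfo metadata, while run() and prepare() receive the concrete tensors through an ITensorPack keyed by TensorType slots. The sketch below is a hypothetical caller, not part of this patch; it uses the pre-move include path and omits the ACL_INT_* auxiliary tensors that the optimized path expects, which would normally be allocated from the operator's workspace requirements.

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"

using namespace arm_compute;

void run_depthwise(ITensor *src, ITensor *weights, ITensor *biases, ITensor *dst, const ConvolutionInfo &info)
{
    cpu::CpuDepthwiseConv2d dwc;
    // Configuration and validation only touch metadata
    if(!bool(cpu::CpuDepthwiseConv2d::validate(src->info(), weights->info(), biases ? biases->info() : nullptr, dst->info(), info)))
    {
        return; // configuration not supported
    }
    dwc.configure(src->info(), weights->info(), biases ? biases->info() : nullptr, dst->info(), info);

    // Execution binds the real tensors to the same slots read by run()/prepare() above
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights);
    pack.add_const_tensor(TensorType::ACL_SRC_2, biases);
    pack.add_tensor(TensorType::ACL_DST_0, dst);

    dwc.prepare(pack); // one-off weight permutation/packing
    dwc.run(pack);
}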
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
deleted file mode 100644
index dd4839b28a..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
-#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Function to execute a depthwise convolution.
- */
-class CpuDepthwiseConv2d : public ICpuOperator
-{
-public:
- /** Default constructor */
- CpuDepthwiseConv2d() = default;
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
- /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
- *
- * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] dst Destination tensor. Data type supported: same as @p src.
- * @param[in] info Depthwise convolution meta-data.
- *
- * @return a Depthwise Convolution Function
- */
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ConvolutionInfo &info);
-
- // Inherited methods overriden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
-private:
- /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
- *
- * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported
- *
- * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) when no assembly kernel implementation is present
- * -# @ref CpuDepthwiseConv2d3x3Kernel if the kernel size is 3x3 and no assembly kernel implementation is present
- * -# @ref CpuDepthwiseConv2dAssemblyDispatch if assembly kernel implementation is present
- * -# @ref CpuActivation if fused activation is required
- *
- */
- class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
- {
- public:
- /** Default constructor */
- CpuDepthwiseConv2dOptimizedInternal() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
- /** Default move constructor */
- CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
- /** Default move assignment operator */
- CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default;
- /** Default destructor */
- ~CpuDepthwiseConv2dOptimizedInternal() = default;
- /** Initialize the function's source, destination, kernels and border_size.
- *
- * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
- // Inherited methods overriden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
- private:
- std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _has_bias{ false };
- bool _is_quantized{ false };
- bool _is_nchw{ true };
- bool _permute{ false };
- bool _is_activationlayer_enabled{ false };
- bool _is_prepared{ false };
- };
-
- /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
- *
- * -# @ref CpuDepthwiseConv2dNativeKernel
- *
- */
- class CpuDepthwiseConv2dGeneric : public ICpuOperator
- {
- public:
- /** Default constructor */
- CpuDepthwiseConv2dGeneric() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete;
- /** Default move constructor */
- CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete;
- /** Default move assignment operator */
- CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default;
- /** Default destructor */
- ~CpuDepthwiseConv2dGeneric() = default;
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2dGeneric::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
-
- private:
- std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _is_nchw{ true };
- bool _is_prepared{ false };
- bool _is_activationlayer_enabled{ false };
- };
-
- DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC };
- CpuDepthwiseConv2dOptimizedInternal _func_optimized{};
- CpuDepthwiseConv2dGeneric _func_generic{};
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H */
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
deleted file mode 100644
index 660ac0163c..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/utils/AssemblyUtils.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
-{
- std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
- bool is_prepared{ false };
- experimental::MemoryRequirements mem_req{};
-};
-
-#ifndef DOXYGEN_SKIP_THIS
-CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
- : _pImpl(std::make_unique<LocalImpl>())
-{
-}
-#endif /* DOXYGEN_SKIP_THIS */
-
-CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default;
-
-void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
- const ITensorInfo *weights,
- const ITensorInfo *bias,
- ITensorInfo *dst,
- const ConvolutionInfo &info)
-{
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
- _pImpl->is_prepared = false;
-
- // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
- {
- return;
- }
-
- auto dwc_wrapper = std::make_unique<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel>();
- ARM_COMPUTE_ERROR_ON(dwc_wrapper == nullptr);
- dwc_wrapper->configure(src, weights, bias, dst, info, ci);
-
- // Compute memory requirements for assembly kernels
- constexpr size_t alignment = 4096;
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads, src->dimension(0)), alignment });
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment });
- _pImpl->asm_kernel = std::move(dwc_wrapper);
-}
-
-Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
-}
-
-experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const
-{
- return _pImpl->mem_req;
-}
-
-bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
-{
- arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
- return act.type != arm_gemm::Activation::Type::None;
-}
-
-void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
- prepare(tensors);
-
- NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors);
-}
-
-void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
-{
- if(!_pImpl->is_prepared)
- {
- // Pack weights and bias
- const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1);
-
- const auto weights_ptr = weights->buffer() + weights->info()->offset_first_element_in_bytes();
- const auto bias_ptr = (bias) ? bias->buffer() + bias->info()->offset_first_element_in_bytes() : nullptr;
- auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
-
- const auto weights_shape = weights->info()->tensor_shape();
- const auto weights_padding = weights->info()->padding();
-
- const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right;
- const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
- _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
-
- weights->mark_as_unused();
- if(bias != nullptr)
- {
- bias->mark_as_unused();
- }
- _pImpl->is_prepared = true;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
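As the configure() above shows, the dispatch advertises two auxiliary buffers through its memory requirements: ACL_INT_0 for the working space (get_working_size()) and ACL_INT_1 for the packed parameters that prepare() fills once from the weights and bias before marking them unused. A minimal, hypothetical calling sequence is sketched below; workspace and packed_weights stand for caller-allocated tensors sized from those requirements, and all tensors are assumed to be NHWC as required by this operator.

void run_asm_depthwise(ITensor *src, ITensor *weights, ITensor *bias, ITensor *dst,
                       ITensor *workspace, ITensor *packed_weights, const ConvolutionInfo &info)
{
    cpu::CpuDepthwiseConv2dAssemblyDispatch asm_dwc;
    asm_dwc.configure(src->info(), weights->info(), bias ? bias->info() : nullptr, dst->info(), info);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights);
    pack.add_const_tensor(TensorType::ACL_SRC_2, bias);
    pack.add_tensor(TensorType::ACL_DST, dst);
    pack.add_tensor(TensorType::ACL_INT_0, workspace);      // working space buffer
    pack.add_tensor(TensorType::ACL_INT_1, packed_weights); // packed parameter storage

    asm_dwc.prepare(pack); // packs weights and bias into ACL_INT_1, then marks them unused
    asm_dwc.run(pack);     // schedules the assembly wrapper kernel over DimY
}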
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
deleted file mode 100644
index f3d3b618c6..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
-#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
-
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Depthwise convolution assembly kernel glue */
-class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator
-{
-public:
- CpuDepthwiseConv2dAssemblyDispatch();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch);
- ~CpuDepthwiseConv2dAssemblyDispatch();
- /** Initialize the function's source, destination, kernels and border_size.
- *
- * @note Supports only NHWC format
- *
- * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM].
- * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: same as @p src or S32 if @p src is quantized.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- * @param[in] info Depthwise convolution meta-data.
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
- /** Checks if activation is supported by the assembly kernels
- *
- * @param[in] activation Activation to check
- *
- * @return True if activation is supported else false
- */
- static bool is_activation_supported(const ActivationLayerInfo &activation);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- struct LocalImpl;
- std::unique_ptr<LocalImpl> _pImpl;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H */
diff --git a/src/runtime/cpu/operators/CpuDequantize.cpp b/src/runtime/cpu/operators/CpuDequantize.cpp
deleted file mode 100644
index 80a2e28aee..0000000000
--- a/src/runtime/cpu/operators/CpuDequantize.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuDequantize.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuDequantizeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuDequantizeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuDequantizeKernel::validate(src, dst);
-}
-
-void CpuDequantize::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- prepare(tensors);
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
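CpuDequantize is the thinnest operator shape in this backend: configure() instantiates a single kernel and run() simply schedules it over DimY. A usage sketch from a hypothetical caller, assuming the kernel consumes the standard ACL_SRC/ACL_DST slots:

void dequantize_example(ITensor *q_in, ITensor *f32_out)
{
    cpu::CpuDequantize dequantize;
    dequantize.configure(q_in->info(), f32_out->info()); // e.g. QASYMM8 -> F32

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, q_in);
    pack.add_tensor(TensorType::ACL_DST, f32_out);
    dequantize.run(pack);
}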
diff --git a/src/runtime/cpu/operators/CpuDequantize.h b/src/runtime/cpu/operators/CpuDequantize.h
deleted file mode 100644
index fdbd6a57c2..0000000000
--- a/src/runtime/cpu/operators/CpuDequantize.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H
-#define ARM_COMPUTE_CPU_DEQUANTIZE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */
-class CpuDequantize : public ICpuOperator
-{
-public:
- /** Configure the kernel.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuDequantize::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */
diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.cpp b/src/runtime/cpu/operators/CpuDirectConv2d.cpp
deleted file mode 100644
index 8812b777a3..0000000000
--- a/src/runtime/cpu/operators/CpuDirectConv2d.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-CpuDirectConv2d::~CpuDirectConv2d() = default;
-
-CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
- _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
-{
-}
-
-void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- _output_stage_kernel = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>();
- _conv_kernel = std::make_unique<kernels::CpuDirectConv2dKernel>();
- _input_border_handler = std::make_unique<NEFillBorderKernel>();
-
- // Free accumulator
- if(_accumulator.buffer() != nullptr)
- {
- _accumulator.allocator()->free();
- }
-
- _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
-
- // Check if bias should be added in the convolution result
- _has_bias = (bias != nullptr);
-
- _conv_kernel->configure(src, weights, dst, conv_info);
- if(_has_bias)
- {
- _output_stage_kernel->configure(dst, bias);
- }
- _is_padding_required = !_conv_kernel->border_size().empty();
-
- if(_is_padding_required)
- {
- // Add zero padding XY
- _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
- }
-
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function = std::make_unique<CpuActivation>();
- _activationlayer_function->configure(dst, dst, act_info);
- }
-}
-
-Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-
- // output might not be initialized since it can be an intermediate tensor of another layer
- DataType data_type = src->data_type();
- TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
-
- // Validate Convolution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
- "Biases size and number of input feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
- }
-
- // Validate bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
-
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
- }
-
- return Status{};
-}
-
-void CpuDirectConv2d::run(ITensorPack &tensors)
-{
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- if(_is_padding_required)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_DST, src);
- NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
- }
- NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
- if(_has_bias)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_0, dst);
- pack.add_tensor(TensorType::ACL_SRC_1, bias);
- pack.add_tensor(TensorType::ACL_DST, dst);
- NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
- }
-
- if(_is_activationlayer_enabled)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, dst);
- pack.add_tensor(TensorType::ACL_DST, dst);
- _activationlayer_function->run(pack);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
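Unlike the purely stateless operators above, CpuDirectConv2d still owns a MemoryGroup, an accumulator tensor and a border-handling kernel, but the caller-facing flow is the same metadata-then-pack pattern. A hedged sketch follows; the ACL_SRC_0 slot may be written to for border filling, and the weights are assumed to travel in ACL_SRC_1 as with the other convolution operators.

void direct_conv_example(ITensor *src, ITensor *weights, ITensor *bias, ITensor *dst)
{
    const PadStrideInfo conv_info(1 /* stride_x */, 1 /* stride_y */, 0 /* pad_x */, 0 /* pad_y */);

    cpu::CpuDirectConv2d conv;
    conv.configure(src->info(), weights->info(), bias ? bias->info() : nullptr, dst->info(), conv_info);

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, src); // written to only for border filling
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights);
    pack.add_const_tensor(TensorType::ACL_SRC_2, bias);
    pack.add_tensor(TensorType::ACL_DST, dst);
    conv.run(pack);
}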
diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.h b/src/runtime/cpu/operators/CpuDirectConv2d.h
deleted file mode 100644
index c17b076f85..0000000000
--- a/src/runtime/cpu/operators/CpuDirectConv2d.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
-#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
-#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Function to run the direct convolution.
- *
- * This function calls the following kernels:
- *
- * -# @ref NEFillBorderKernel for the input
- * -# @ref kernels::CpuDirectConv2dOutputStageKernel
- * -# @ref kernels::CpuDirectConv2dKernel
- */
-class CpuDirectConv2d : public ICpuOperator
-{
-public:
- CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- ~CpuDirectConv2d();
- /** Set the input, weights, biases and output tensors.
- *
- * @note DirectConvolution only works in the following configurations:
- *    1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F16/F32
- *    3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F16/F32
- *    5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F32
- *
- * @param[in, out] src Input tensor info. Data types supported: F16/F32.
- * @param[in] weights Set of kernels to convolve the input volume.
- * Supported sizes: 1x1, 3x3 and 5x5.
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p src.
- * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src.
- * @param[out] dst Output tensor info.
- *                      The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDirectConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- MemoryGroup _memory_group;
- std::unique_ptr<kernels::CpuDirectConv2dOutputStageKernel> _output_stage_kernel;
- std::unique_ptr<kernels::CpuDirectConv2dKernel> _conv_kernel;
- std::unique_ptr<NEFillBorderKernel> _input_border_handler;
- std::unique_ptr<CpuActivation> _activationlayer_function;
- Tensor _accumulator;
- bool _has_bias{ false };
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
- bool _is_padding_required{ false };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */
diff --git a/src/runtime/cpu/operators/CpuElementwise.cpp b/src/runtime/cpu/operators/CpuElementwise.cpp
deleted file mode 100644
index 8953d4769c..0000000000
--- a/src/runtime/cpu/operators/CpuElementwise.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuElementwise.h"
-#include "src/core/cpu/kernels/CpuElementwiseKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuElementwiseBase::run(ITensorPack &tensors)
-{
- // If the kernel has been configured, use the window from the kernel.
- if(_kernel->is_window_configured())
- {
- ICpuOperator::run(tensors);
- return;
- }
-
- auto src0_info = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info();
- auto src1_info = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info();
- auto shape_and_window = compute_output_shape_and_window(src0_info->tensor_shape(), src1_info->tensor_shape());
- ICpuOperator::run(tensors, shape_and_window.second);
-}
-
-template <ArithmeticOperation op>
-void CpuElementwiseArithmetic<op>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuArithmeticKernel>();
- k->configure(op, src0, src1, dst);
- _kernel = std::move(k);
-}
-
-template <ArithmeticOperation op>
-Status CpuElementwiseArithmetic<op>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- return kernels::CpuArithmeticKernel::validate(op, src0, src1, dst);
-}
-
-template class CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
-
-void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuDivisionKernel>();
- k->configure(src0, src1, dst);
- _kernel = std::move(k);
-}
-
-Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- return kernels::CpuDivisionKernel::validate(src0, src1, dst);
-}
-
-void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuPowerKernel>();
- k->configure(src0, src1, dst);
- _kernel = std::move(k);
-}
-
-Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- return kernels::CpuPowerKernel::validate(src0, src1, dst);
-}
-
-template <ComparisonOperation COP>
-void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuComparisonKernel>();
- k->configure(COP, src0, src1, dst);
- _kernel = std::move(k);
-}
-
-template <ComparisonOperation COP>
-Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
-}
-
-void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
-{
- auto k = std::make_unique<kernels::CpuComparisonKernel>();
- k->configure(op, src0, src1, dst);
- _kernel = std::move(k);
-}
-
-Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
-{
- return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
-}
-
-// Supported Specializations
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
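The explicit template instantiations at the end of this file are what back the aliases declared in the header below (CpuElementwiseMax, CpuElementwiseMin, and so on). A short sketch of the common binary-operator calling pattern, assuming the arithmetic kernel binds the ACL_SRC_0/ACL_SRC_1/ACL_DST slots:

void elementwise_max_example(ITensor *a, ITensor *b, ITensor *out)
{
    cpu::CpuElementwiseMax max_op;
    max_op.configure(a->info(), b->info(), out->info());

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, a);
    pack.add_const_tensor(TensorType::ACL_SRC_1, b);
    pack.add_tensor(TensorType::ACL_DST, out);
    // run() recomputes the broadcast output window if the kernel was configured without one
    max_op.run(pack);
}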
diff --git a/src/runtime/cpu/operators/CpuElementwise.h b/src/runtime/cpu/operators/CpuElementwise.h
deleted file mode 100644
index ef5caf2825..0000000000
--- a/src/runtime/cpu/operators/CpuElementwise.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuElementwiseBase : public ICpuOperator
-{
-public:
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for division and power
- *
- * @note Max/Min/Squared difference support input data types of QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32
- * @note PRelu supports input data types of QASYMM8/QASYMM8_SIGNED/F16/F32.
- */
-template <ArithmeticOperation op>
-class CpuElementwiseArithmetic : public CpuElementwiseBase
-{
-public:
- /** Configure the operator
- *
- * @param[in] src0 The first source tensor information.
- * @param[in] src1 The second source tensor information. With PRelu, this is used as alpha tensor.
- * @param[out] dst The output tensor information.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuElementwiseArithmetic::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the maximum operation */
-using CpuElementwiseMax = CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the minimum operation */
-using CpuElementwiseMin = CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the squared difference operation */
-using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
-
-/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division
- *
- * @note The tensor data type for the inputs must be S32/F16/F32.
- * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i])
- */
-class CpuElementwiseDivision : public CpuElementwiseBase
-{
-public:
- /** Initialise the kernel's inputs and dst.
- *
- * @param[in, out] src0 First tensor input info. Data types supported: S32/F16/F32.
- * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuElementwiseDivision::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power
- *
- * @note The tensor data type for the inputs must be F16/F32.
- * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
- * @note For an exponent that is a float, this function will only work with a positive base.
- */
-class CpuElementwisePower : public CpuElementwiseBase
-{
-public:
- /** Initialise the kernel's inputs and dst.
- *
- * @param[in, out] src0 First tensor input info. Data types supported: F16/F32.
- * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuElementwisePower::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuComparisonKernel.
- *
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @note The function performs a comparison operation between two tensors.
- */
-class CpuElementwiseComparison : public CpuElementwiseBase
-{
-public:
- /** Initialise the kernel's inputs, dst and comparison operation.
- *
- * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: U16/U32.
- * @param[in] op Comparison Operation to be performed.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuElementwiseComparison::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
- *
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @note The function performs a comparison operation between two tensors.
- */
-template <ComparisonOperation op>
-class CpuElementwiseComparisonStatic : public CpuElementwiseBase
-{
-public:
- /** Initialise the kernel's inputs and dst.
- *
- * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: U16/U32.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuElementwiseComparisonStatic::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run equal comparison. */
-using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
-/** Basic function to run not equal comparison. */
-using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
-/** Basic function to run greater comparison. */
-using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
-/** Basic function to run greater-equal comparison. */
-using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
-/** Basic function to run less comparison. */
-using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
-/** Basic function to run less-equal comparison. */
-using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
\ No newline at end of file
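For reference, a minimal usage sketch of the CpuElementwiseDivision operator declared above (illustrative only, not part of this patch; the 16x4 F32 shapes and the ACL_SRC_0/ACL_SRC_1/ACL_DST pack slots mentioned in the comment are assumptions):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/runtime/cpu/operators/CpuElementwise.h"

    using namespace arm_compute;

    void example_elementwise_division_f32()
    {
        // Two matching F32 inputs and an F32 output, described only by their metadata at configure time.
        TensorInfo num(TensorShape(16U, 4U), 1, DataType::F32);
        TensorInfo den(TensorShape(16U, 4U), 1, DataType::F32);
        TensorInfo out(TensorShape(16U, 4U), 1, DataType::F32);

        cpu::CpuElementwiseDivision div;
        if(bool(cpu::CpuElementwiseDivision::validate(&num, &den, &out)))
        {
            div.configure(&num, &den, &out); // out[i] = num[i] / den[i]
            // At run time the backing ITensor objects are supplied through an ITensorPack
            // (assumed slots: ACL_SRC_0, ACL_SRC_1, ACL_DST) and executed with div.run(pack).
        }
    }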
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp b/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
deleted file mode 100644
index c79e6e9acf..0000000000
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuElementwiseUnary.h"
-#include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using KernelType = kernels::CpuElementwiseUnaryKernel;
-
-void CpuElementwiseUnary::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
-{
- auto k = std::make_unique<KernelType>();
- k->configure(op, src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
-{
- return KernelType::validate(op, src, dst);
-}
-
-void CpuElementwiseUnary::run(ITensorPack &tensors)
-{
- if(_kernel->is_window_configured())
- {
- ICpuOperator::run(tensors);
- return;
- }
-
- auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info();
- ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.h b/src/runtime/cpu/operators/CpuElementwiseUnary.h
deleted file mode 100644
index 5ea29e07e9..0000000000
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
-
-#include "arm_compute/core/Types.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuElementwiseUnary : public ICpuOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] op Unary operation to execute
- * @param[in] src Input tensor information. Data types supported: F16/F32 (F16/F32/S32 for NEG/ABS operations).
- * @param[out] dst Output tensor information. Data types supported: Same as @p src.
- */
- void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuElementwiseUnary::configure()
- *
- * @return a status
- */
- static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
\ No newline at end of file
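A similarly minimal sketch for the unary operator above (illustrative, not from the patch; the ABS enumerator and the 1-D F32 shape are example choices). Note that configure()/validate() take references here rather than pointers:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/runtime/cpu/operators/CpuElementwiseUnary.h"

    using namespace arm_compute;

    void example_elementwise_abs_f32()
    {
        TensorInfo src(TensorShape(32U), 1, DataType::F32);
        TensorInfo dst(TensorShape(32U), 1, DataType::F32);

        cpu::CpuElementwiseUnary op;
        if(bool(cpu::CpuElementwiseUnary::validate(ElementWiseUnary::ABS, src, dst)))
        {
            op.configure(ElementWiseUnary::ABS, src, dst); // dst[i] = |src[i]|
        }
    }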
diff --git a/src/runtime/cpu/operators/CpuFill.cpp b/src/runtime/cpu/operators/CpuFill.cpp
deleted file mode 100644
index 081e30ea17..0000000000
--- a/src/runtime/cpu/operators/CpuFill.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFill.h"
-
-#include "src/core/cpu/kernels/CpuFillKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFill::configure(const ITensorInfo *tensor, PixelValue constant_value)
-{
- auto k = std::make_unique<kernels::CpuFillKernel>();
- k->configure(tensor, constant_value);
- _kernel = std::move(k);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFill.h b/src/runtime/cpu/operators/CpuFill.h
deleted file mode 100644
index b946467da6..0000000000
--- a/src/runtime/cpu/operators/CpuFill.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FILL_H
-#define ARM_COMPUTE_CPU_FILL_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuFillKernel */
-class CpuFill : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in,out] tensor Tensor to fill. Supported data types: All
- * @param[in] constant_value The value used to fill the planes of the tensor
- */
- void configure(const ITensorInfo *tensor, PixelValue constant_value);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FILL_H */
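A minimal configuration sketch for the fill operator above (illustrative; the S32 shape and the zero value are example choices, and the class only exposes configure(), there is no validate()):

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/runtime/cpu/operators/CpuFill.h"

    using namespace arm_compute;

    void example_fill_with_zero()
    {
        TensorInfo info(TensorShape(8U, 8U), 1, DataType::S32);

        cpu::CpuFill fill;
        fill.configure(&info, PixelValue(0)); // every element of the backing tensor is set to 0
        // The tensor itself is provided later through an ITensorPack and executed with fill.run(pack).
    }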
diff --git a/src/runtime/cpu/operators/CpuFlatten.cpp b/src/runtime/cpu/operators/CpuFlatten.cpp
deleted file mode 100644
index 58e6e4b671..0000000000
--- a/src/runtime/cpu/operators/CpuFlatten.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFlatten.h"
-
-#include "src/core/cpu/kernels/CpuReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuReshapeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuReshapeKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFlatten.h b/src/runtime/cpu/operators/CpuFlatten.h
deleted file mode 100644
index 3e24a93429..0000000000
--- a/src/runtime/cpu/operators/CpuFlatten.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FLATTEN_H
-#define ARM_COMPUTE_CPU_FLATTEN_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to flatten a given input */
-class CpuFlatten : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:--------------|
- * |All |All |
- *
- * @param[in] src Source tensor to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All
- * @param[out] dst Destination tensor with shape [w*h*d, input_batches] where:
- * w = width of the input tensor, h = height of the input tensor and d = depth of the input tensor.
- * Data type supported: same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuFlatten::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLATTEN_H */
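A shape-oriented sketch of the flatten operator above (illustrative sizes): a 7x7x64 feature map with 8 batches collapses to [7*7*64, 8] = [3136, 8], matching the [w*h*d, input_batches] layout described in the parameter documentation:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/runtime/cpu/operators/CpuFlatten.h"

    using namespace arm_compute;

    void example_flatten_feature_map()
    {
        TensorInfo src(TensorShape(7U, 7U, 64U, 8U), 1, DataType::F32); // [w, h, d, batches]
        TensorInfo dst(TensorShape(3136U, 8U), 1, DataType::F32);       // [w*h*d, batches]

        cpu::CpuFlatten flatten;
        if(bool(cpu::CpuFlatten::validate(&src, &dst)))
        {
            flatten.configure(&src, &dst);
        }
    }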
diff --git a/src/runtime/cpu/operators/CpuFloor.cpp b/src/runtime/cpu/operators/CpuFloor.cpp
deleted file mode 100644
index 4e169a04be..0000000000
--- a/src/runtime/cpu/operators/CpuFloor.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFloor.h"
-
-#include "src/core/cpu/kernels/CpuFloorKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFloor::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuFloorKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuFloorKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFloor.h b/src/runtime/cpu/operators/CpuFloor.h
deleted file mode 100644
index 0cd0cc0b4e..0000000000
--- a/src/runtime/cpu/operators/CpuFloor.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FLOOR_H
-#define ARM_COMPUTE_CPU_FLOOR_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuFloorKernel */
-class CpuFloor : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuFloor::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLOOR_H */
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.cpp b/src/runtime/cpu/operators/CpuFullyConnected.cpp
deleted file mode 100644
index eeabce0753..0000000000
--- a/src/runtime/cpu/operators/CpuFullyConnected.cpp
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFullyConnected.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuTransposeKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
-#include "src/runtime/cpu/operators/CpuFlatten.h"
-#include "src/runtime/cpu/operators/CpuGemm.h"
-#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::experimental;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-// Get min, max bound of a quantized asymmetric dst tensor, with the effect of fused activation
-std::pair<PixelValue, PixelValue> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type)
-{
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- const UniformQuantizationInfo q_unif = q_info.uniform();
-
- if(act_info.enabled())
- {
- switch(act_info.activation())
- {
- case ActivationLayerInfo::ActivationFunction::RELU:
- type_min = PixelValue(q_unif.offset);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- type_min = PixelValue(q_unif.offset);
- type_max = PixelValue(act_info.a(), data_type, q_info);
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- type_min = PixelValue(act_info.b(), data_type, q_info);
- type_max = PixelValue(act_info.a(), data_type, q_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Activation function not supported.");
- break;
- }
- }
-
- return std::make_pair(type_min, type_max);
-}
-
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
-{
- const auto data_type = src->data_type();
- const QuantizationInfo oq_info = dst->quantization_info();
- const UniformQuantizationInfo iq_unif = src->quantization_info().uniform();
- const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_unif = oq_info.uniform();
-
- float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
- int32_t output_multiplier;
- int32_t output_shift;
-
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
-
- gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
- gemmlowp_output_stage_info.gemmlowp_shift = output_shift;
- gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset;
- gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage_info.gemmlowp_min_bound = type_min.get<int32_t>();
- gemmlowp_output_stage_info.gemmlowp_max_bound = type_max.get<int32_t>();
-
- return Status{};
-}
-
-Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act)
-{
- if(is_data_type_quantized_asymmetric(src->data_type()))
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info));
-
- GEMMInfo gemm_info;
- gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
-
- // Validate gemmlowp function
- TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
- TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info,
- &weights_info,
- biases,
- dst,
- gemm_info));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
- }
-
- return Status{};
-}
-} // namespace
-
-CpuFullyConnected::CpuFullyConnected()
- : _flatten(nullptr),
- _convert_weights(nullptr),
- _transpose_weights(nullptr),
- _mm_gemm(nullptr),
- _mm_gemmlowp(nullptr),
- _flattened_src(),
- _converted_weights(),
- _reshaped_weights(),
- _trans_weights(),
- _trans_weights_idx(AuxTensorIdx::Count),
- _aux_mem(Count),
- _needs_weights_conversion(false),
- _needs_weights_reshape(false),
- _is_fc_after_conv(false),
- _is_quantized_asymmetric(false),
- _is_prepared(false)
-
-{
-}
-
-CpuFullyConnected::~CpuFullyConnected() = default;
-
-void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
-{
- if(_is_quantized_asymmetric)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
-
- TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
- TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
-
- // Configure gemmlowp function and output stage for asymmetric quantized types
- GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
- ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
-
- GEMMInfo gemm_info;
- gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
- gemm_info.set_activation_info(act);
- _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&src_info, &weights_info, biases, dst, gemm_info);
- }
- else
- {
- // Configure matrix multiply kernel
- GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */);
- gemm_info.set_activation_info(act);
- _mm_gemm = std::make_unique<CpuGemm>();
- _mm_gemm->configure(src, weights, biases, dst, 1.f, 1.0f, gemm_info);
- }
-}
-
-void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
-{
- ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
-
- // If the fully connected layer is called after a convolution layer, the src tensor must be linearized
-
- // Initialize output tensor for flatten
- auto_init_if_empty(_flattened_src, src->clone()->set_tensor_shape(compute_flatten_shape(src)));
-
- _flatten = std::make_unique<CpuFlatten>();
- _flatten->configure(src, &_flattened_src);
-
- // Configure matrix multiply kernel
- configure_mm(&_flattened_src, weights, biases, dst, act);
-}
-
-void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
-{
- ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
-
- // Configure matrix multiply kernel
- configure_mm(src, weights, biases, dst, act);
-}
-
-void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src,
- weights,
- biases,
- dst,
- fc_info));
-
- _needs_weights_conversion = false;
- _needs_weights_reshape = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
- _needs_weights_reshape = _needs_weights_reshape && !fc_info.retain_internal_weights;
- _is_fc_after_conv = true;
- _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
- _is_prepared = false;
- _trans_weights_idx = AuxTensorIdx::Count;
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const ITensorInfo *weights_to_use = weights;
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(is_batched_fc_layer)
- {
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
- src->tensor_shape().cend(),
- dst->tensor_shape().cbegin() + 1));
- }
- else
- {
- _is_fc_after_conv = src->num_dimensions() > 1;
- }
-
- // Reshape weights if needed
- if(_needs_weights_reshape)
- {
- // Reshape the weights
- _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
- _transpose_weights->configure(weights, &_reshaped_weights);
- weights_to_use = &_reshaped_weights;
- _trans_weights_idx = AuxTensorIdx::TransposedWeights;
- }
-
- // Convert weights if needed
- if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
- {
- // Convert weights
- _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>();
- _convert_weights->configure(weights_to_use,
- &_converted_weights,
- src->tensor_shape(),
- fc_info.weights_trained_layout);
-
- weights_to_use = &_converted_weights;
- _needs_weights_conversion = true;
- _trans_weights_idx = AuxTensorIdx::ConvertedWeights;
- }
-
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
- }
-
- // Retain the tensorinfo with the weights to use
- if(_needs_weights_reshape || _needs_weights_conversion)
- {
- _trans_weights = *weights_to_use;
- }
-
- // Set auxiliary memory requirements
- auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
- for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
- {
- _aux_mem[i] = gemm_mem_req[i];
- }
-
- if(_aux_mem[Pretranspose].size > 0)
- {
- // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
- }
- else
- {
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), _needs_weights_conversion ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Persistent, _converted_weights.total_size());
- }
- _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
-}
-
-Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info)
-{
- ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!fc_info.constant_weights, "Non-constant weights are currently not supported");
-
- bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- bool is_fc_after_conv = true;
-
- const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const ITensorInfo *src_to_use = src;
- const ITensorInfo *weights_to_use = weights;
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = dst->dimension(1) > 1;
-
- if(is_batched_fc_layer)
- {
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
- src->tensor_shape().cend(),
- dst->tensor_shape().cbegin() + 1));
- }
- else
- {
- is_fc_after_conv = src->num_dimensions() > 1;
- }
-
- if(!weights_reshaped)
- {
- // Validate reshape weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights));
- weights_to_use = &reshaped_weights;
- }
-
- if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
- {
- // Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- src->tensor_shape(),
- fc_info.weights_trained_layout));
- weights_to_use = &converted_weights;
- }
-
- if(is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
-
- // Validate flatten kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src));
- src_to_use = &flatten_src;
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
- }
- // Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info));
-
- return Status{};
-}
-
-void CpuFullyConnected::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- auto src = tensors.get_const_tensor(ACL_SRC_0);
-
- CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
- CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
-
- // Linearize src if it comes from a convolutional layer
- if(_is_fc_after_conv)
- {
- ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
- _flatten->run(flatten_pack);
- }
-
- ITensorPack gemm_pack = tensors;
- gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
- if(_needs_weights_reshape || _needs_weights_conversion)
- {
- gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
- }
-
- // Run matrix multiply
- if(_is_quantized_asymmetric)
- {
- _mm_gemmlowp->run(gemm_pack);
- }
- else
- {
- _mm_gemm->run(gemm_pack);
- }
-}
-
-void CpuFullyConnected::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto weights = tensors.get_const_tensor(ACL_SRC_1);
-
- CpuAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
- CpuAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);
-
- // Pointer to current weights
- const ITensor *cur_weights = weights;
-
- // Reshape of the weights (happens only once)
- if(_needs_weights_reshape)
- {
- // Run reshape weights kernel and mark weights as unused
- ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
- NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack);
-
- cur_weights->mark_as_unused();
- cur_weights = reshaped_weights.get();
- }
-
- // Convert weights if needed (happens only once)
- if(_needs_weights_conversion)
- {
- ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
- _convert_weights->run(convert_pack);
-
- cur_weights->mark_as_unused();
- cur_weights = converted_weights.get();
- }
-
- ITensorPack gemm_pack = tensors;
- gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
-
- // Prepare GEMM and release unused weights
- if(!_is_quantized_asymmetric)
- {
- _mm_gemm->prepare(gemm_pack);
- }
- else
- {
- _mm_gemmlowp->prepare(gemm_pack);
- }
-
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements CpuFullyConnected::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute
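The requantization factor computed by get_gemmlowp_output_stage_info() above is multiplier = (input_scale * weights_scale) / output_scale, which the library then turns into a fixed-point multiplier/shift pair through quantization::calculate_quantized_multiplier(). A standalone numeric sketch of that decomposition with made-up scales (plain C++ only; it mirrors the arithmetic rather than calling the library helper):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Illustrative quantization scales (not taken from the patch).
        const float input_scale   = 0.5f;
        const float weights_scale = 0.25f;
        const float output_scale  = 1.0f;

        // Same formula as get_gemmlowp_output_stage_info() above.
        const float multiplier = (input_scale * weights_scale) / output_scale; // 0.125

        // Split into a normalised mantissa in [0.5, 1) and a power-of-two exponent,
        // which is the shape a fixed-point requantization multiplier/shift pair takes.
        int         exponent   = 0;
        const float normalised = std::frexp(multiplier, &exponent);      // 0.5, exponent = -2
        const long  q31        = std::lround(normalised * 2147483648.0); // 0.5 * 2^31 = 1073741824

        std::printf("multiplier=%f -> mantissa(Q31)=%ld, exponent=%d\n", multiplier, q31, exponent);
        return 0;
    }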
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.h b/src/runtime/cpu/operators/CpuFullyConnected.h
deleted file mode 100644
index 498ceae68d..0000000000
--- a/src/runtime/cpu/operators/CpuFullyConnected.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H
-#define ARM_COMPUTE_CPU_FULLY_CONNECTED_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include "arm_compute/core/TensorInfo.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-// Forward declarations
-class CpuConvertFullyConnectedWeights;
-class CpuFlatten;
-class CpuGemm;
-class CpuGemmLowpMatrixMultiplyCore;
-namespace kernels
-{
-class CpuTransposeKernel;
-} // namespace kernels
-/** Basic function to compute a Fully Connected layer. This function calls the following kernels:
- * -# @ref kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer)
- * -# @ref kernels::CpuTransposeKernel (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
- * -# @ref CpuGemm or @ref CpuGemmLowpMatrixMultiplyCore (if quantized asymmetric)
- * -# @ref kernels::CpuGemmMatrixAdditionKernel or @ref CpuGemmLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr)
- *
- * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class CpuFullyConnected : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuFullyConnected();
- /** Destructor */
- ~CpuFullyConnected();
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:------------------|:------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- *
- * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the input's first three dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p src.
- * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
- * @param[out] dst Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p src.
- * @param[in] fc_info (Optional) Fully connected layer additional info
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected
- *
- * Similar to @ref CpuFullyConnected::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
- void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
- void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
-
- enum AuxTensorIdx
- {
- AsmGemmWorkspace = 0,
- Pretranspose,
- GemmTemp1, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
- GemmTemp2, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
- GemmTemp3, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
- GemmTemp4, // CpuGemmLowpMatrixMultiplyCore only
- GemmTemp5, // CpuGemmLowpMatrixMultiplyCore only
- GemmTemp6, // CpuGemmLowpMatrixMultiplyCore only
- GemmTemp7, // CpuGemmLowpMatrixMultiplyCore only
- TransposedWeights,
- ConvertedWeights,
- FlattenedSrc,
- Count
- };
-
- std::unique_ptr<CpuFlatten> _flatten;
- std::unique_ptr<CpuConvertFullyConnectedWeights> _convert_weights;
- std::unique_ptr<kernels::CpuTransposeKernel> _transpose_weights;
- std::unique_ptr<CpuGemm> _mm_gemm;
- std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
-
- TensorInfo _flattened_src;
- TensorInfo _converted_weights;
- TensorInfo _reshaped_weights;
- TensorInfo _trans_weights;
- AuxTensorIdx _trans_weights_idx;
-
- experimental::MemoryRequirements _aux_mem;
-
- bool _needs_weights_conversion;
- bool _needs_weights_reshape;
- bool _is_fc_after_conv;
- bool _is_quantized_asymmetric;
- bool _is_prepared;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FULLY_CONNECTED_H */
diff --git a/src/runtime/cpu/operators/CpuGemm.cpp b/src/runtime/cpu/operators/CpuGemm.cpp
deleted file mode 100644
index bd3f231001..0000000000
--- a/src/runtime/cpu/operators/CpuGemm.cpp
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemm.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-using namespace arm_compute::experimental;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
-{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
- asm_info.fast_mode = info.fast_math();
-
- return asm_info;
-}
-} // namespace
-
-void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
-
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
- bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info));
-
- // Check if we need to reshape the matrix B only on the first run
- _is_prepared = false;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _run_vector_matrix_multiplication = a->dimension(1) < 2;
- _run_alpha_scale = alpha != 1.f;
- _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run();
- _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
- _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised
- && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
-
- if(run_optimised)
- {
- const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
- _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
- _asm_glue->configure(a, b, c_to_use, d, asm_info);
- ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
-
- auto asm_mem_req = _asm_glue->workspace();
- _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
- _aux_mem[Pretraspose] = asm_mem_req[Pretraspose];
-
- // Scale product by alpha
- if(_run_alpha_scale)
- {
- _alpha_scale_func = std::make_unique<cpu::CpuActivation>();
- _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
- }
- }
- else
- {
- // Pick output tensor in case bias addition should be performed
- ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
-
- _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
-
- // Select between GEMV and GEMM
- if(_run_vector_matrix_multiplication)
- {
- // Configure the matrix multiply kernel
- _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
- }
- else
- {
- const int m = a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
-
- // Configure interleave kernel
- _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
- _interleave_kernel->configure(a, &_tmp_a);
- _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
-
- // Configure transpose kernel
- _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
- _transpose_kernel->configure(b, &_tmp_b);
- _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
-
- // Configure matrix multiplication kernel
- _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
- }
-
- if(_run_bias_addition)
- {
- _add_bias = std::make_unique<cpu::CpuAdd>();
- _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
- _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
- }
- }
-
- // Configure matrix addition kernel
- if(_run_addition)
- {
- _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
- _ma_kernel->configure(c, d, beta);
- }
-
- // Configure activation
- if(_run_activation)
- {
- _activation_func = std::make_unique<cpu::CpuActivation>();
- _activation_func->configure(d, nullptr, gemm_info.activation_info());
- }
-}
-
-Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- if(a->data_type() != DataType::BFLOAT16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
- }
-
- if(c != nullptr && !is_c_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
- }
-
- if(d->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != d->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
- {
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1) * d->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
- }
- }
-
- // Check if we need to run the optimized assembly kernel
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info));
-
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D");
-
- // Check if the first input tensor is a vector.
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- // Check if we need to reshape the matrix A and matrix B
- const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run());
-
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- const int m = a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
-
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo tmp_output_info = *d->clone();
-
- if(run_interleave_transpose)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
-
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
- }
-
- // Validate matrix multiply
- auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
-
- if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE));
- }
- }
-
- // Validate matrix addition kernel
- if(beta != 0 && c != nullptr && !is_c_bias)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta));
- }
-
- // Validate activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation));
- }
-
- return Status{};
-}
-
-void CpuGemm::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- auto a = tensors.get_const_tensor(ACL_SRC_0);
- auto b = tensors.get_const_tensor(ACL_SRC_1);
- auto c = tensors.get_const_tensor(ACL_SRC_2);
- auto d = tensors.get_tensor(ACL_DST);
-
- if(_asm_glue->is_configured())
- {
- // Pass c to asm dispatch only if it's the bias tensor
- ITensorPack asm_pack = tensors;
- asm_pack.add_const_tensor(ACL_SRC_2, (_reshape_b_only_on_first_run) ? c : nullptr);
- _asm_glue->run(asm_pack);
- if(_run_alpha_scale)
- {
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
- _alpha_scale_func->run(pack);
- }
- }
- else
- {
- CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true);
- CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
- CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
-
- ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } };
- if(!_run_vector_matrix_multiplication)
- {
- // Run interleave kernel
- ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } };
- NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
- }
-
- // Use reshaped matrices
- mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
- mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
- }
-
- NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack);
-
- // Run bias addition kernel
- if(_run_bias_addition)
- {
- ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } };
- _add_bias->run(pack);
- }
- }
-
- // Run matrix addition kernel
- if(_run_addition)
- {
- ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } };
- NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack);
- }
-
- // Run activation function
- if(_run_activation)
- {
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
- _activation_func->run(pack);
- }
-}
-
-void CpuGemm::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- if(_asm_glue->is_configured())
- {
- _asm_glue->prepare(tensors);
- }
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
- {
- const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
- ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
-
- CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
- ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
- }
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements CpuGemm::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute
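
For orientation between the removed files: the operator above exposes the stateless experimental interface (configure()/validate() on ITensorInfo, run()/prepare() on an ITensorPack, workspace() for auxiliary memory). Below is a minimal caller-side sketch of that flow; it assumes the pre-move header path used in this tree, the shapes and names are illustrative, and the auxiliary workspace wiring that a framework layer normally derives from workspace() is omitted.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuGemm.h" // pre-move location, as in this tree

using namespace arm_compute;

void run_gemm_sketch()
{
    // Describe A (32x64), B (64x16) and the result D (32x16) in F32.
    TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(16U, 64U), 1, DataType::F32);
    TensorInfo d_info(TensorShape(16U, 32U), 1, DataType::F32);

    cpu::CpuGemm gemm;
    gemm.configure(&a_info, &b_info, nullptr, &d_info, 1.0f, 0.0f);

    // Back the infos with real tensors and bind them through a tensor pack.
    Tensor a, b, d;
    a.allocator()->init(a_info);
    b.allocator()->init(b_info);
    d.allocator()->init(d_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &a);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &b);
    pack.add_tensor(TensorType::ACL_DST, &d);

    gemm.run(pack); // prepare() is invoked internally on the first run
}
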
diff --git a/src/runtime/cpu/operators/CpuGemm.h b/src/runtime/cpu/operators/CpuGemm.h
deleted file mode 100644
index 8d859791f5..0000000000
--- a/src/runtime/cpu/operators/CpuGemm.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_GEMM_H
-#define ARM_COMPUTE_CPU_GEMM_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
-#include "src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
-#include "src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
-#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuAdd.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to execute GEMM. This function calls the following kernels:
- *
- * If optimized assembly is available:
- * -# @ref cpu::CpuGemmAssemblyDispatch
- * -# @ref cpu::CpuActivation (if alpha != 1.0)
- * Else:
- * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel (if the output tensor is a matrix)
- * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel (if the output tensor is a matrix)
- * -# @ref cpu::kernels::CpuGemmMatrixMultiplyKernel
- * In both cases:
- * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel (if c != nullptr and beta != 0.0 and matrix B is not reshaped only on the first run)
- * Else:
- * -# @ref cpu::CpuAdd (if c != nullptr, matrix B is reshaped only on the first run and the optimized assembly path is not used)
- *
- * -# @ref cpu::CpuActivation (if activation is specified in GEMMInfo)
- */
-class CpuGemm : public ICpuOperator
-{
-public:
- /** Default constructor */
- CpuGemm() = default;
- /** Default destructor */
- ~CpuGemm() = default;
- /** Configure operator for a given list of arguments
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:------------|:-----------|:---------|:--------------|
- * |F32 |F32 |F32 |F32 |
- * |F16 |F16 |F16 |F16 |
- * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
- *
- * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
- * @note The tensors a, b, c and d must all have the same data type; do not mix data types when calling this function.
- *
- * @param[in] a First input tensor info (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32
- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
- * @param[out] d Output tensor info. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run
- */
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
- *
- * Similar to @ref CpuGemm::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum AuxTensorIdx
- {
- AsmGemmWorkspace = 0,
- Pretranspose,
- InterleavedLHS,
- TransposedRHS,
- TempResult,
- Count
- };
-
- std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{ nullptr };
- std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{ nullptr };
- std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr };
- std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr };
- std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr };
- std::unique_ptr<CpuActivation> _alpha_scale_func{ nullptr };
- std::unique_ptr<CpuAdd> _add_bias{ nullptr };
- std::unique_ptr<CpuActivation> _activation_func{ nullptr };
-
- TensorInfo _tmp_a{};
- TensorInfo _tmp_b{};
- TensorInfo _tmp_d{};
-
- bool _run_vector_matrix_multiplication{ false };
- bool _run_alpha_scale{ false };
- bool _run_addition{ false };
- bool _run_bias_addition{ false };
- bool _run_activation{ false };
- bool _reshape_b_only_on_first_run{ false };
- bool _is_prepared{ false };
-
- experimental::MemoryRequirements _aux_mem{ Count };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_GEMM_H */
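
As a plain reference for what the kernel chain documented above collectively computes, here is a naive scalar version of the GEMM contract d = alpha * A * B + beta * C (row-major, no interleaving, transposition or blocking; purely illustrative and not part of the library).

#include <cstddef>
#include <vector>

// d[m][n] = alpha * sum_k a[m][k] * b[k][n] + beta * c[m][n], with c optional.
void gemm_reference(const std::vector<float> &a, const std::vector<float> &b, const std::vector<float> &c,
                    std::vector<float> &d, std::size_t M, std::size_t N, std::size_t K, float alpha, float beta)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        for(std::size_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(std::size_t k = 0; k < K; ++k)
            {
                acc += a[m * K + k] * b[k * N + n];
            }
            d[m * N + n] = alpha * acc + ((beta != 0.f && !c.empty()) ? beta * c[m * N + n] : 0.f);
        }
    }
}
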
diff --git a/src/runtime/cpu/operators/CpuGemmConv2d.cpp b/src/runtime/cpu/operators/CpuGemmConv2d.cpp
deleted file mode 100644
index a81dd8a661..0000000000
--- a/src/runtime/cpu/operators/CpuGemmConv2d.cpp
+++ /dev/null
@@ -1,612 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemmConv2d.h"
-
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include "src/core/cpu/kernels/CpuCol2ImKernel.h"
-#include "src/core/cpu/kernels/CpuIm2ColKernel.h"
-#include "src/core/cpu/kernels/CpuReshapeKernel.h"
-#include "src/core/cpu/kernels/CpuWeightsReshapeKernel.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuGemm.h"
-#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
-#include "src/runtime/cpu/operators/CpuGemmLowpOutputStage.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-#include <set>
-#include <tuple>
-
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace cpu
-{
-CpuGemmConv2d::CpuGemmConv2d()
- : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape_kernel(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
- _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
-{
-}
-CpuGemmConv2d::~CpuGemmConv2d() = default;
-
-void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info,
- bool enable_fast_math, int gemm_3d_depth)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col));
-
- // Create GEMMInfo structure
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info);
-
- // Supported activations in GEMM
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(_is_quantized)
- {
- TensorInfo tmp_src{ *src };
- TensorInfo tmp_weights{ *weights };
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo iqinfo = src->quantization_info();
- const QuantizationInfo wqinfo = weights->quantization_info();
- const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
- const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = src->data_type();
-
- tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(tmp_weights.data_type()))
- {
- const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
- tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
- }
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- if(supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
-
- _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info));
-
- auto mm_mem_req = _mm_gemmlowp->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
- {
- _aux_mem[cont] = mm_mem_req[cont];
- }
- }
- else
- {
- // Configure matrix multiply function
- _mm_gemm = std::make_unique<CpuGemm>();
- _mm_gemm->configure(src, weights, biases, dst, 1.0f, 0.0f, gemm_info);
- auto mm_mem_req = _mm_gemm->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
- {
- _aux_mem[cont] = mm_mem_req[cont];
- }
- }
-}
-
-Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col)
-{
- const DataType data_type = src->data_type();
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_activation_enabled = act_info.enabled();
-
- // Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info);
-
- if(is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo &iqinfo = src->quantization_info();
- const QuantizationInfo &wqinfo = weights->quantization_info();
- const QuantizationInfo &oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
-
- // Perform validation step on GEMMLowp
- std::unique_ptr<ITensorInfo> input_qa = src->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
- weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info,
- false, enable_fast_math, false, act_info));
- }
- else
- {
- // Perform validation step on Matrix multiply function
- return CpuGemm::validate(src, weights, nullptr, dst, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
-{
- const DataType data_type = input_info->data_type();
- const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
- const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
-
- // Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
- const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
-
- return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col);
-}
-
-void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_UNUSED(num_groups, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src,
- weights,
- biases,
- dst,
- conv_info,
- weights_info,
- dilation,
- act_info,
- enable_fast_math,
- num_groups));
-
- const DataType data_type = src->data_type();
- const DataLayout data_layout = src->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->dimension(idx_width);
- const unsigned int kernel_height = weights->dimension(idx_height);
-
- _is_prepared = weights_info.retain_internal_weights();
- _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
- _data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- const ITensorInfo *gemm_input_to_use = src;
- ITensorInfo *gemm_output_to_use = dst;
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
- ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
- "Output shape does not match the expected one");
-
- // Check if GEMM3D is supported
- if(data_layout == DataLayout::NHWC)
- {
- _skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true));
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!_skip_col2im)
- {
- _skip_im2col = false;
- }
- }
- else
- {
- _skip_col2im = false;
- }
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- unsigned int mat_weights_cols = weights->dimension(idx_kernels);
-
- // _weights_reshaped will be auto configured in the kernel.
- // Biases are not appended and the 1xW transpose is skipped here, as matrix B is reshaped inside CpuGemm
- _weights_reshape_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>();
- _weights_reshape_kernel->configure(weights, nullptr, &_weights_reshaped);
- _weights_reshaped.set_quantization_info(weights->quantization_info());
-
- // Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
- {
- // Configure
- _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
- _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
-
- // Update GEMM input
- gemm_input_to_use = &_im2col_output;
- }
-
- // Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!_skip_col2im)
- {
- TensorShape shape_gemm;
-
- // Calculate GEMM output shape
- shape_gemm = _im2col_output.tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
- _gemm_output = TensorInfo(shape_gemm, 1, output_data_type);
- _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
- _gemm_output_3d = TensorInfo(_gemm_output);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output;
- }
- else
- {
- _gemm_output_3d = TensorInfo(*dst);
- _gemm_output_3d.set_data_type(output_data_type).set_data_layout(src->data_layout()).set_is_resizable(true);
- _gemm_output = TensorInfo(_gemm_output_3d);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output_3d;
- }
-
- // Configure GEMM
- // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
- const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
- configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth);
-
- if(!_skip_col2im && _data_layout == DataLayout::NCHW)
- {
- // Configure col2im
- _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
- _col2im_kernel->configure(gemm_output_to_use, dst, Size2D(conv_w, conv_h));
- }
- else
- {
- // Configure reshape layer
- _reshape_kernel = std::make_unique<kernels::CpuReshapeKernel>();
- _reshape_kernel->configure(gemm_output_to_use, dst);
- }
-
- // Check if GEMM transforms weights
- // Modernise through COMPMID-4535
- bool gemm_trans_wei = _aux_mem[1].size > 0; // Asm Pretranspose
- gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[3].size > 0 : gemm_trans_wei; // Transpose RHS
- gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
-
- // Check lifetime
- _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
- _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size());
- _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
-}
-
-Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
-
- const DataLayout data_layout = src->data_layout();
- const DataType data_type = src->data_type();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->dimension(idx_width);
- const unsigned int kernel_height = weights->dimension(idx_height);
-
- TensorInfo im2col_reshaped_info{};
- TensorInfo info_gemm{};
- TensorInfo tmp_info{};
- TensorInfo weights_reshaped_info{};
- const ITensorInfo *gemm_input_to_use = src;
- const ITensorInfo *gemm_output_to_use = dst;
- const ITensorInfo *weights_to_use = weights;
-
- const bool append_bias = false;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_bf16 = data_type == DataType::BFLOAT16;
- bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- // Check if GEMM3D is supported
- bool skip_col2im = false;
- if(data_layout == DataLayout::NHWC)
- {
- skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true));
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!skip_col2im)
- {
- skip_im2col = false;
- }
- }
-
- if(skip_col2im)
- {
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!bool(validate_gemm3d(src, weights, act_info, conv_h, skip_im2col)))
- {
- skip_im2col = false;
- skip_col2im = false;
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- // Validate biases
- if(biases != nullptr)
- {
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else if(is_bf16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
-
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
- weights_reshaped_info.set_quantization_info(weights->quantization_info());
- weights_to_use = &weights_reshaped_info;
-
- if(!skip_im2col)
- {
- // Create tensor info for im2col reshaped inputs
- // For CPU, the batch size is on the fourth dimension
- TensorShape shape_im2col = src->tensor_shape();
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
- shape_im2col.set(2, 1);
-
- im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
- im2col_reshaped_info.set_quantization_info(src->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
- gemm_input_to_use = &im2col_reshaped_info;
- }
-
- // Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!skip_col2im)
- {
- TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
- info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
- }
- else
- {
- info_gemm = TensorInfo(dst->tensor_shape(), 1, output_data_type);
- }
- info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
- gemm_output_to_use = &info_gemm;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col));
-
- // Validate Col2Im/ReshapeLayer
- if(!skip_col2im && (data_layout == DataLayout::NCHW))
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
- }
-
- return Status{};
-}
-
-void CpuGemmConv2d::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- auto src = tensors.get_const_tensor(ACL_SRC_0);
- auto dst = tensors.get_tensor(ACL_DST);
- auto gemm_input_to_use = src;
-
- CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
- CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
- CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
-
- bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
- if(!_skip_im2col)
- {
- // Run input reshaping
- unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, im2col_output.get() }
- };
- NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
- gemm_input_to_use = im2col_output.get();
- }
-
- // Handle the case where output has top/bottom padding
- const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst;
- Tensor gemm3d;
- _gemm_output_3d.extend_padding(out_to_use->info()->padding());
- gemm3d.allocator()->soft_init(_gemm_output_3d);
- gemm3d.allocator()->import_memory(out_to_use->buffer());
- auto gemm_output_to_use = gemm_output.get();
-
- if(_skip_im2col)
- {
- gemm_output_to_use = &gemm3d;
- }
- if(_skip_col2im && !out_has_padding)
- {
- gemm_output_to_use = dst;
- }
-
- // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
- ITensorPack pack_mm = tensors;
- pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
- pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
- pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
- if(_is_quantized)
- {
- // Run gemmlowp
- _mm_gemmlowp->run(pack_mm);
- }
- else
- {
- // Run gemm
- _mm_gemm->run(pack_mm);
- }
-
- // Reshape output matrix
- if(!_skip_col2im)
- {
- if(_data_layout == DataLayout::NCHW)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output.get() },
- { TensorType::ACL_DST, dst }
- };
- NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
- }
- else
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
- NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
- }
- }
- else if(out_has_padding)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
- NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
- }
-}
-
-void CpuGemmConv2d::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- // Run weights reshaping and mark original weights tensor as unused
- CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, weights_reshaped.get() }
- };
- NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
- weights->mark_as_unused();
-
- // Prepare GEMM
- ITensorPack gemm_pack = tensors;
- gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
- _is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack);
-
- _is_prepared = true;
- }
-}
-experimental::MemoryRequirements CpuGemmConv2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
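
To make the shape bookkeeping in the configure()/validate() paths above concrete, here is a small sketch of how one convolution maps onto a single GEMM after im2col. It assumes symmetric padding and FLOOR rounding (the default of scaled_dimensions()); the struct and function names are illustrative only.

#include <cstddef>

struct ConvAsGemm
{
    std::size_t m; // GEMM rows: conv_w * conv_h output positions
    std::size_t n; // GEMM columns: number of kernels (OFM), i.e. mat_weights_cols
    std::size_t k; // accumulation depth: kernel_w * kernel_h * IFM, i.e. mat_weights_rows
};

ConvAsGemm conv_as_gemm(std::size_t in_w, std::size_t in_h, std::size_t ifm, std::size_t ofm,
                        std::size_t kernel_w, std::size_t kernel_h,
                        std::size_t pad_x, std::size_t pad_y,
                        std::size_t stride_x, std::size_t stride_y,
                        std::size_t dilation_x = 1, std::size_t dilation_y = 1)
{
    const std::size_t conv_w = (in_w + 2 * pad_x - (dilation_x * (kernel_w - 1) + 1)) / stride_x + 1;
    const std::size_t conv_h = (in_h + 2 * pad_y - (dilation_y * (kernel_h - 1) + 1)) / stride_y + 1;
    return ConvAsGemm{ conv_w * conv_h, ofm, kernel_w * kernel_h * ifm };
}
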
diff --git a/src/runtime/cpu/operators/CpuGemmConv2d.h b/src/runtime/cpu/operators/CpuGemmConv2d.h
deleted file mode 100644
index 529256594f..0000000000
--- a/src/runtime/cpu/operators/CpuGemmConv2d.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_GEMM_CONV2D_H
-#define ARM_COMPUTE_CPU_GEMM_CONV2D_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuGemm;
-class CpuGemmLowpMatrixMultiplyCore;
-class CpuGemmLowpOutputStage;
-namespace kernels
-{
-class CpuWeightsReshapeKernel;
-class CpuIm2ColKernel;
-class CpuCol2ImKernel;
-class CpuReshapeKernel;
-} // namespace kernels
-
-/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
- *
- * -# @ref cpu::kernels::CpuIm2ColKernel
- * -# @ref CpuGemm (if the data type is BFLOAT16/FP16/FP32)
- * -# @ref CpuGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref CpuGemmLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref cpu::kernels::CpuCol2ImKernel (if NCHW data layout)
- * -# @ref kernels::CpuWeightsReshapeKernel
- *
- */
-class CpuGemmConv2d : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuGemmConv2d();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuGemmConv2d(const CpuGemmConv2d &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- CpuGemmConv2d(CpuGemmConv2d &&) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CpuGemmConv2d &operator=(const CpuGemmConv2d &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- CpuGemmConv2d &operator=(CpuGemmConv2d &&) = delete;
- /** Destructor */
- ~CpuGemmConv2d();
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:------------------|:--------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
- *
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with cpu::kernels::CpuWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
- * available, which can reduce accuracy. Default is false
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuGemmConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- /** Configures the appropriate matrix multiply routine
- *
- * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] dst Output tensor info. Data types supported: Same as @p input,
- * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
- * available, which can reduce accuracy. Default is false
- * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
- */
- void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, int gemm_3d_depth = 1);
- /** Static function to check if given info will lead to a valid configuration of the @ref CpuGemmConv2d matrix multiply routines
- *
- * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] dst Output tensor info. Data types supported: Same as @p input,
- * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
- * available, which can reduce accuracy. Default is false
- * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
- * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
- *
- * @return a status
- */
- static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false);
- /** Static function to check if GEMM3D is supported in @ref CpuGemm or in @ref CpuGemmLowpMatrixMultiplyCore
- *
- * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
- *
- * @return a status
- */
- static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
-
- enum AuxTensorIdx
- {
- // CpuGemmLowpMatrixMultiplyCore has up to 8 internal tensors
- Im2ColOutput = 9,
- WeightsReshaped,
- GemmOutput,
- Count
- };
-
- std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_kernel;
- std::unique_ptr<cpu::kernels::CpuIm2ColKernel> _im2col_kernel;
- std::unique_ptr<CpuGemm> _mm_gemm;
- std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
- std::unique_ptr<kernels::CpuCol2ImKernel> _col2im_kernel;
- std::unique_ptr<kernels::CpuReshapeKernel> _reshape_kernel;
-
- TensorInfo _im2col_output;
- TensorInfo _weights_reshaped;
- TensorInfo _gemm_output;
- TensorInfo _gemm_output_3d;
-
- DataLayout _data_layout;
-
- bool _skip_im2col;
- bool _skip_col2im;
- bool _is_quantized;
- bool _is_prepared;
-
- experimental::MemoryRequirements _aux_mem{ Count };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_GEMM_CONV2D_H */
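
The class documentation above lists CpuIm2ColKernel as the first stage of the decomposition. As a conceptual reference only, a naive NCHW im2col is sketched below: each output row gathers one kernel_h x kernel_w x IFM input patch so that the convolution reduces to the GEMM sketched earlier. Unit dilation and a zero padding value are assumed, and the element order within a patch is illustrative; the real kernel's ordering must match the weights-reshape ordering.

#include <cstddef>
#include <vector>

// src is NCHW [ifm][in_h][in_w]; dst becomes [conv_h * conv_w][kernel_h * kernel_w * ifm].
void im2col_reference(const std::vector<float> &src, std::vector<float> &dst,
                      std::size_t in_w, std::size_t in_h, std::size_t ifm,
                      std::size_t kernel_w, std::size_t kernel_h,
                      std::size_t pad_x, std::size_t pad_y,
                      std::size_t stride_x, std::size_t stride_y)
{
    const std::size_t conv_w = (in_w + 2 * pad_x - kernel_w) / stride_x + 1;
    const std::size_t conv_h = (in_h + 2 * pad_y - kernel_h) / stride_y + 1;
    const std::size_t patch  = kernel_w * kernel_h * ifm;
    dst.assign(conv_w * conv_h * patch, 0.f); // out-of-bounds taps stay zero (padding)

    for(std::size_t oy = 0; oy < conv_h; ++oy)
    {
        for(std::size_t ox = 0; ox < conv_w; ++ox)
        {
            float *out = &dst[(oy * conv_w + ox) * patch];
            for(std::size_t c = 0; c < ifm; ++c)
            {
                for(std::size_t ky = 0; ky < kernel_h; ++ky)
                {
                    for(std::size_t kx = 0; kx < kernel_w; ++kx, ++out)
                    {
                        const std::ptrdiff_t iy = static_cast<std::ptrdiff_t>(oy * stride_y + ky) - static_cast<std::ptrdiff_t>(pad_y);
                        const std::ptrdiff_t ix = static_cast<std::ptrdiff_t>(ox * stride_x + kx) - static_cast<std::ptrdiff_t>(pad_x);
                        if(iy >= 0 && ix >= 0 && iy < static_cast<std::ptrdiff_t>(in_h) && ix < static_cast<std::ptrdiff_t>(in_w))
                        {
                            *out = src[(c * in_h + static_cast<std::size_t>(iy)) * in_w + static_cast<std::size_t>(ix)];
                        }
                    }
                }
            }
        }
    }
}
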
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
deleted file mode 100644
index 10eece99eb..0000000000
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-#include "support/Cast.h"
-
-#include <set>
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::experimental;
-using namespace arm_compute::utils::cast;
-
-namespace
-{
-GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
-{
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo iqinfo = src->quantization_info();
- const QuantizationInfo wqinfo = weights->quantization_info();
- const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = src->data_type();
- // Merge activation with output stage
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
- }
- GEMMLowpOutputStageInfo os_info;
- os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- os_info.gemmlowp_offset = uoqinfo.offset;
- os_info.gemmlowp_min_bound = min_activation;
- os_info.gemmlowp_max_bound = max_activation;
- os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info);
- return os_info;
-}
-cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
-{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
- asm_info.ps_info = info.conv_info;
- asm_info.activation_info = info.act_info;
- asm_info.depth_output_gemm3d = true;
- asm_info.reinterpret_input_as_3d = true;
- asm_info.padding_top = info.conv_info.pad_top();
- asm_info.padding_left = info.conv_info.pad_left();
- asm_info.padding_value = 0.f;
- asm_info.negated_offsets = false;
- asm_info.fast_mode = info.enable_fast_math;
- return asm_info;
-}
-} // namespace
-
-CpuGemmDirectConv2d::CpuGemmDirectConv2d()
- : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>()),
- _activation_func(std::make_unique<CpuActivation>()),
- _weights_permute_func(std::make_unique<CpuPermute>()),
- _aux_mem(AuxTensorIdx::Count),
- _perm_weights(),
- _run_activation(false),
- _is_prepared(false)
-{
-}
-
-CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
-
-void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src,
- weights,
- biases != nullptr ? biases : nullptr,
- dst,
- info));
- _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
- _is_prepared = false;
-
- _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 });
-
- // Configure assembly dispatch
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- if(is_data_type_quantized(src->data_type()))
- {
- asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
- }
- _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
-
- // Configure activation
- if(_run_activation)
- {
- _activation_func->configure(dst, nullptr, info.act_info);
- }
-
- // Add auxiliary memory requirements of the assembly dispatch
- auto asm_mem_req = _gemm_asm_func->workspace();
- _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
- _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
-
- if(_aux_mem[Pretranspose].size > 0)
- {
-        // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
- }
- else
- {
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
- }
-}
-Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
- const DataType data_type = src->data_type();
- const TensorShape i_shape = src->tensor_shape();
- const TensorShape w_shape = weights->tensor_shape();
- ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
- ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- // Validate biases
- if(biases != nullptr)
- {
- if(is_data_type_quantized_asymmetric(data_type))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else if(data_type == DataType::BFLOAT16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info));
- return Status{};
-}
-void CpuGemmDirectConv2d::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- _gemm_asm_func->run(tensors);
- if(_run_activation)
- {
- _activation_func->run(tensors);
- }
-}
-
-void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
-
- CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux);
- ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
- _weights_permute_func->run(permute_tensors);
-
- tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get());
- // Call prepare of assembly dispatch
- _gemm_asm_func->prepare(tensors);
-
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
deleted file mode 100644
index 7fb20b3037..0000000000
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
-#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-struct Conv2dInfo;
-namespace cpu
-{
-class CpuGemmDirectConv2d : public ICpuOperator
-{
-public:
- CpuGemmDirectConv2d();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d);
- ~CpuGemmDirectConv2d();
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:--------------|:--------------|:--------------|
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
- *
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] dst     Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
- *
- * Similar to CpuGemmDirectConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum AuxTensorIdx
- {
- AsmGemmWorkspace = 0,
- Pretranspose,
- PermutedWeights,
- Count
- };
-
- std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func;
- std::unique_ptr<CpuActivation> _activation_func;
- std::unique_ptr<CpuPermute> _weights_permute_func;
- experimental::MemoryRequirements _aux_mem;
- TensorInfo _perm_weights;
- bool _run_activation;
- bool _is_prepared;
-};
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */
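
For reference, below is a minimal sketch (not part of the patch) of how the validate() contract documented in the removed header is typically exercised. The NHWC shapes, the F32 data type and the default-constructed Conv2dInfo (assumed to mean stride 1, no padding, unit dilation, no fused activation) are illustrative assumptions, and the include path is the pre-move one deleted by this change.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/FunctionDescriptors.h"
    #include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"

    using namespace arm_compute;

    Status check_direct_conv2d()
    {
        // NHWC tensors: src is [IFM, W, H, N], weights are [IFM, kernel_x, kernel_y, OFM]
        TensorInfo src(TensorShape(8U, 4U, 4U, 1U), 1, DataType::F32);
        TensorInfo weights(TensorShape(8U, 1U, 1U, 16U), 1, DataType::F32);
        TensorInfo biases(TensorShape(16U), 1, DataType::F32);
        TensorInfo dst(TensorShape(16U, 4U, 4U, 1U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        weights.set_data_layout(DataLayout::NHWC);
        dst.set_data_layout(DataLayout::NHWC);

        // Assumed defaults: stride 1, no padding, 1x1 dilation, no activation, num_groups 1
        const Conv2dInfo conv_info{};

        // An error status means the assembly-backed direct convolution cannot
        // handle this configuration on the current CPU
        return cpu::CpuGemmDirectConv2d::validate(&src, &weights, &biases, &dst, conv_info);
    }

On success a caller would configure() with the same infos, allocate the auxiliary buffers reported by workspace(), run prepare() once so the weights are permuted and pre-transposed, and then call run() per inference.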
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
deleted file mode 100644
index 7affc3f506..0000000000
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ /dev/null
@@ -1,711 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
-
-#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
-#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
-{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
- asm_info.output_stage = info.gemmlowp_output_stage();
- asm_info.fast_mode = info.fast_math();
-
- return asm_info;
-}
-} // namespace
-
-CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
- : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
- _mm_kernel(),
- _mtx_a_reshape_kernel(),
- _mtx_b_reshape_kernel(),
- _mtx_a_reduction_kernel(),
- _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(),
- _offset_contribution_output_stage_kernel(),
- _activation_func(),
- _convert_to_signed_asymm(),
- _convert_from_signed_asymm(),
- _vector_sum_col(),
- _vector_sum_row(),
- _tmp_a(),
- _tmp_b(),
- _mm_result_s32(),
- _signed_a(),
- _signed_output(),
- _a_offset(0),
- _b_offset(0),
- _run_vector_matrix_multiplication(false),
- _assembly_path(false),
- _fused_assembly_path(false),
- _reshape_b_only_on_first_run(false),
- _is_prepared(false),
- _fuse_output_stage(false),
- _run_activation(false),
- _flip_signedness(false),
- _gemm_info(),
- _aux_mem(Count)
-{
-}
-CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
-
-void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
-
- const ITensorInfo *matrix_a = a;
- const ITensorInfo *matrix_b = b;
- GEMMInfo info = gemm_info;
-
- // Set internal variables
- _a_offset = a->quantization_info().uniform().offset;
- _b_offset = b->quantization_info().uniform().offset;
- _run_vector_matrix_multiplication = a->dimension(1) < 2;
- _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
- _is_prepared = false;
- _fused_assembly_path = false;
- _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
- _gemm_info = gemm_info;
-
- _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
-
- const ITensorInfo *a_to_use = a;
-
- // Convert to QASYMM8 -> QASYMM8_SIGNED and back
- if(_flip_signedness)
- {
- const int32_t offset_correction = 128;
- const DataType dt = DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
-
- _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
- _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
- _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
- a_to_use = &_signed_a;
- _a_offset = _signed_a.quantization_info().uniform().offset;
-
- const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
- _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
-
- // Output stage correction
- GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
- output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset;
- output_stage_corr.gemmlowp_min_bound -= offset_correction;
- output_stage_corr.gemmlowp_max_bound -= offset_correction;
- info.set_gemmlowp_output_stage(output_stage_corr);
-
- // Update matrix a
- matrix_a = &_signed_a;
- }
-
- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- _fuse_output_stage = true;
- _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
- }
-
- // Initialize assembly kernel meta-data
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
-#ifdef __aarch64__
- switch(a->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::U8:
- case DataType::S8:
- {
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- auto c_info_to_use = c == nullptr ? nullptr : c;
- _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
- _fused_assembly_path = _asm_glue->is_configured();
- }
- else
- {
- auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
- _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
- }
- _assembly_path = _asm_glue->is_configured();
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Datatype not supported");
- break;
- }
- }
-#endif /* __aarch64__ */
- if(!(_assembly_path || _run_vector_matrix_multiplication))
- {
- matrix_a = &_tmp_a;
- matrix_b = &_tmp_b;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
-
- // Configure interleave kernel
- _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
- _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
-
- // Configure transpose kernel
- _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
- _mtx_b_reshape_kernel->configure(b, &_tmp_b);
- }
-
- if(!_fused_assembly_path)
- {
- // Build reduction info
- const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
-
- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
- {
- _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
- _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
- }
-
- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
- _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
- }
-
- if(_fuse_output_stage)
- {
- // Configure matrix multiply kernel
- if(!_assembly_path)
- {
- _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
- _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
- }
-
- _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
- _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
- _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : dst,
- a->dimension(0),
- _a_offset, _b_offset, info.gemmlowp_output_stage());
-
- if(_flip_signedness)
- {
- _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
- _convert_from_signed_asymm->configure(&_signed_output, dst);
- }
- }
- else
- {
- // Configure matrix multiply kernel
- if(!_assembly_path)
- {
- _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
- _mm_kernel->configure(matrix_a, matrix_b, dst);
- }
- // Configure offset contribution kernel
- _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
- _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
- _a_offset, _b_offset);
- }
- }
- // Configure activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
- if(_run_activation)
- {
- _activation_func = std::make_unique<CpuActivation>();
- _activation_func->configure(dst, nullptr, activation);
- }
-
- if(_assembly_path)
- {
- auto asm_mem_req = _asm_glue->workspace();
- _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
- _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
- }
-
- // Request memory for LHS and RHS reshape matrix
- _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0
- && _reshape_b_only_on_first_run ?
- MemoryLifetime::Persistent :
- MemoryLifetime::Temporary,
- _vector_sum_col.total_size());
- _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
- _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
- _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
- _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
- _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
-}
-
-Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- GEMMInfo info = gemm_info;
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- const ITensorInfo *a_to_use = a;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo mm_result_s32_info{};
-
- int32_t a_offset = a->quantization_info().uniform().offset;
- int32_t b_offset = b->quantization_info().uniform().offset;
-
- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if(fuse_output_stage)
- {
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
- }
-
- // Convert QASYMM8->QASYMM8_SIGNED
- TensorInfo signed_a{};
- TensorInfo signed_output{};
- bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
- if(flip_signedness)
- {
- const int32_t offset_correction = 128;
- const DataType dt = DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
-
- signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
- a_to_use = &signed_a;
- a_offset = signed_a.quantization_info().uniform().offset;
-
- const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
- signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
-
- // Output stage correction
- GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
- output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
- output_stage_corr.gemmlowp_min_bound -= offset_correction;
- output_stage_corr.gemmlowp_max_bound -= offset_correction;
- info.set_gemmlowp_output_stage(output_stage_corr);
-
- // Update matrix a
- matrix_a_info = &signed_a;
- }
-
- // Initialize assembly kernel meta-data
- const AsmGemmInfo asm_info = init_assembly_metadata(info);
-
- // Check if we need to run the optimized assembly kernel
- bool run_optimised = false;
- bool run_optimised_requantized = false;
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
- run_optimised_requantized = run_optimised;
- }
- else
- {
- run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
- }
-
- if(run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(info.depth_output_gemm3d() != 0)
- {
- if(info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
-
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if(!run_vector_matrix_multiplication)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
- }
- }
-
- if(!run_optimised_requantized)
- {
- TensorInfo info_vector_sum_col{};
- TensorInfo info_vector_sum_row{};
-
- const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
-
- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
- {
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
- }
-
- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
- {
- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
- }
-
- if(fuse_output_stage)
- {
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
- }
-
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- flip_signedness ? &signed_output : output,
- a_offset, b_offset,
- info.gemmlowp_output_stage()));
- }
- else
- {
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
- }
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
- }
- }
-
- // Validate activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
- }
-
- return Status{};
-}
-
-void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- auto a_to_use = a;
- auto matrix_a = a;
- auto matrix_b = b;
-
- CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
- CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
- CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
- CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
- CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
- CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
- CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
-
- // Convert QASYMM8->QASYMM8_SIGNED
- if(_flip_signedness)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, a },
- { TensorType::ACL_DST, signed_a.get() }
- };
- NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
- a_to_use = signed_a.get();
- matrix_a = signed_a.get();
- }
-
- // Run GEMM
- if(_asm_glue->is_configured())
- {
- ITensorPack asm_glue_tensors = tensors;
- auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
- if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
- asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
- asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
- asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
- }
- else
- {
- asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
- asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
- asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
- }
- _asm_glue->run(asm_glue_tensors);
- }
- else
- {
- if(!_run_vector_matrix_multiplication)
- {
- matrix_a = tmp_a.get();
- matrix_b = tmp_b.get();
- // Run interleave kernel
- ITensorPack pack_a =
- {
- { TensorType::ACL_SRC, a_to_use },
- { TensorType::ACL_DST, tmp_a.get() }
- };
- NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);
-
- if(!_reshape_b_only_on_first_run)
- {
- ITensorPack pack_b =
- {
- { TensorType::ACL_SRC, b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
- // Run transpose kernel
- NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
- }
- }
- ITensorPack pack_mm =
- {
- { TensorType::ACL_SRC_0, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b }
- };
- if(_fuse_output_stage)
- {
- pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
- }
- else
- {
- pack_mm.add_tensor(TensorType::ACL_DST, dst);
- }
- NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
- }
-
- if(!_fused_assembly_path)
- {
- // Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, a_to_use },
- { TensorType::ACL_DST, vector_sum_row.get() }
- };
- NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, b },
- { TensorType::ACL_DST, vector_sum_col.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
- }
-
- if(_fuse_output_stage)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
- pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
- pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
- pack.add_tensor(TensorType::ACL_SRC_3, c);
- pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
-
- // Run offset contribution kernel
- NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
- }
- else
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
- pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
- pack.add_tensor(TensorType::ACL_DST, dst);
-
- // Run offset contribution kernel
- NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
- }
- }
-
- // Convert QASYMM8_SIGNED->QASYMM8
- if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, signed_output.get() },
- { TensorType::ACL_DST, dst }
- };
- NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
- }
-
- // Run fused activation unless already run in the fused assembly
- if(_run_activation)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, dst },
- { TensorType::ACL_DST, dst }
- };
- _activation_func->run(pack);
- }
-}
-
-void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- // Run assembly reshape
- if(_asm_glue->is_configured())
- {
- _asm_glue->prepare(tensors);
- }
- // Run non-assembly reshape
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
- {
- // Run reshape kernel and mark original weights tensor as unused
- ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
- CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, original_b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
- {
- ITensor *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
- CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, original_b },
- { TensorType::ACL_DST, vector_sum_col.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
- }
- _is_prepared = true;
- }
-}
-experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
deleted file mode 100644
index 1d0e470559..0000000000
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
-#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-class CpuGemmInterleave4x4Kernel;
-class CpuGemmLowpMatrixMultiplyKernel;
-class CpuGemmLowpOffsetContributionKernel;
-class CpuGemmLowpOffsetContributionOutputStageKernel;
-class CpuGemmLowpMatrixAReductionKernel;
-class CpuGemmLowpMatrixBReductionKernel;
-class CpuGemmTranspose1xWKernel;
-class CpuConvertQuantizedSignednessKernel;
-} // namespace kernels
-class CpuGemmAssemblyDispatch;
-class CpuActivation;
-
-/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
- *
- * -# @ref kernels::CpuGemmInterleave4x4Kernel
- * -# @ref kernels::CpuGemmTranspose1xWKernel
- * -# @ref kernels::CpuGemmLowpMatrixMultiplyKernel
- * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
- * -# @ref CpuActivation
- *
- * otherwise if the DOT product instruction is available:
- *
- * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
- *
-*/
-class CpuGemmLowpMatrixMultiplyCore : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuGemmLowpMatrixMultiplyCore();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyCore);
- /** Destructor */
- ~CpuGemmLowpMatrixMultiplyCore();
- /** Initialise the kernel's inputs, output
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:------------------|:--------|:--------------|
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
- * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
- * |QASYMM8 |QASYMM8 |S32 |S32 |
- * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
- * |QASYMM8 |QSYMM8 |S32 |S32 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
- * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
- * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
- *
- * @note GEMM_LOWP: low precision GEMM kernel
- * This kernel performs the following computations:
- *
- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
- * -# Compute the matrix product of the resulting a * b in int32.
- *
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
- *
- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[out] dst       Output tensor info. Data type supported: S32/QASYMM8/QASYMM8_SIGNED
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should be executed only for the first run
- */
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuGemmLowpMatrixMultiplyCore::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum AuxTensorIdx
- {
- AsmGemmWorkspace = 0,
- Pretranspose,
- VectorSumCol,
- VectorSumRow,
- TmpA,
- TmpB,
- MMResultS32,
- SignedA,
- SignedOutput,
- Count
- };
-
- std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue;
- std::unique_ptr<kernels::CpuGemmLowpMatrixMultiplyKernel> _mm_kernel;
- std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _mtx_a_reshape_kernel;
- std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _mtx_b_reshape_kernel;
- std::unique_ptr<kernels::CpuGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
- std::unique_ptr<kernels::CpuGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
- std::unique_ptr<kernels::CpuGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
- std::unique_ptr<kernels::CpuGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
- std::unique_ptr<CpuActivation> _activation_func;
- std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_to_signed_asymm;
- std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_from_signed_asymm;
-
- TensorInfo _vector_sum_col;
- TensorInfo _vector_sum_row;
- TensorInfo _tmp_a;
- TensorInfo _tmp_b;
- TensorInfo _mm_result_s32;
- TensorInfo _signed_a;
- TensorInfo _signed_output;
- int32_t _a_offset;
- int32_t _b_offset;
-
- bool _run_vector_matrix_multiplication;
- bool _assembly_path;
- bool _fused_assembly_path;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- bool _fuse_output_stage;
- bool _run_activation;
- bool _flip_signedness;
- GEMMInfo _gemm_info;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
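
For reference, a minimal sketch (not part of the patch) of the configure()/workspace() flow documented in the removed header, using the S32-output path with no fused output stage. The shapes and quantization values are illustrative assumptions and the include path is the pre-move one deleted by this change.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    void configure_lowp_gemm()
    {
        // dim0 is the row width: A is MxK, B is KxN, dst is MxN (K = 32, M = 8, N = 16)
        TensorInfo a(TensorShape(32U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
        TensorInfo b(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3));
        TensorInfo dst(TensorShape(16U, 8U), 1, DataType::S32); // raw int32 accumulators

        cpu::CpuGemmLowpMatrixMultiplyCore gemm;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmLowpMatrixMultiplyCore::validate(&a, &b, nullptr, &dst, GEMMInfo()));
        gemm.configure(&a, &b, nullptr, &dst, GEMMInfo());

        // The operator does not own its working memory: the caller allocates one buffer
        // per entry reported here and binds them into the ITensorPack used by prepare()/run()
        const experimental::MemoryRequirements aux = gemm.workspace();
        ARM_COMPUTE_UNUSED(aux);
    }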
diff --git a/src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp
deleted file mode 100644
index e17f854a21..0000000000
--- a/src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemmLowpOutputStage.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info));
-
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- {
- switch(info.output_data_type)
- {
- case DataType::QASYMM8:
- {
- auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- case DataType::QSYMM16:
- {
- auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported output data type.");
- break;
- }
- }
- break;
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- {
- switch(info.output_data_type)
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- {
- auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel>();
- k->configure(src, bias, dst, &info);
- _kernel = std::move(k);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported output data type.");
- break;
- }
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
- }
-}
-
-Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
-
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- {
- switch(dst->data_type())
- {
- case DataType::QASYMM8:
- return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- case DataType::QASYMM8_SIGNED:
- return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- case DataType::QSYMM16:
- return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
- }
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- {
- switch(dst->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- return kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info);
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
- }
- }
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
- }
-}
-
-void CpuGemmLowpOutputStage::run(ITensorPack &tensors)
-{
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuGemmLowpOutputStage.h b/src/runtime/cpu/operators/CpuGemmLowpOutputStage.h
deleted file mode 100644
index bed88a60d5..0000000000
--- a/src/runtime/cpu/operators/CpuGemmLowpOutputStage.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
-#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
-
-#include "arm_compute/core/Types.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-/** This file contains all available output stages for GEMMLowp.
- *
- * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore),
- * and processes it to obtain the final ASYMM8 value.
- *
- * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
- */
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to execute GEMMLowpQuantizeDown kernels.
- *
- * This function calls the following kernels:
- *
- * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel
- * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
-*/
-class CpuGemmLowpOutputStage : public ICpuOperator
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |dst |
- * |:--------------|:-------------|:-------------|
- * |S32 |S32 |QASYMM8 |
- * |S32 |S32 |QASYMM8_SIGNED|
- * |S32 |S32 |QSYMM16 |
- *
- * @param[in] src Input tensor info. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] dst  Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16
- * @param[in] info GEMMLowp output stage metadata.
- */
- void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuGemmLowpOutputStage::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H */
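
For reference, a minimal sketch (not part of the patch) of the QUANTIZE_DOWN_FIXEDPOINT path documented in the removed header. The multiplier, shift, offset and bound values are illustrative assumptions and the include path is the pre-move one deleted by this change.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/runtime/cpu/operators/CpuGemmLowpOutputStage.h"

    using namespace arm_compute;

    void configure_output_stage()
    {
        TensorInfo src(TensorShape(64U, 16U), 1, DataType::S32);     // int32 accumulators
        TensorInfo bias(TensorShape(64U), 1, DataType::S32);         // optional shared bias
        TensorInfo dst(TensorShape(64U, 16U), 1, DataType::QASYMM8); // requantized result

        GEMMLowpOutputStageInfo info{};
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.output_data_type    = DataType::QASYMM8;
        info.gemmlowp_multiplier = 1 << 30; // example fixed-point multiplier
        info.gemmlowp_shift      = 2;       // example right shift
        info.gemmlowp_offset     = 10;      // destination zero point
        info.gemmlowp_min_bound  = 0;
        info.gemmlowp_max_bound  = 255;

        cpu::CpuGemmLowpOutputStage stage;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmLowpOutputStage::validate(&src, &bias, &dst, info));
        stage.configure(&src, &bias, &dst, info);
        // At run time the caller packs the backing ITensor objects and calls stage.run(pack)
    }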
diff --git a/src/runtime/cpu/operators/CpuMul.cpp b/src/runtime/cpu/operators/CpuMul.cpp
deleted file mode 100644
index 2f3d442a70..0000000000
--- a/src/runtime/cpu/operators/CpuMul.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuMul.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuMulKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy);
-}
-
-void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<kernels::CpuMulKernel>();
- k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy);
- _kernel = std::move(k);
-}
-
-void CpuMul::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-
-Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return kernels::CpuComplexMulKernel::validate(src1, src2, dst);
-}
-
-void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<kernels::CpuComplexMulKernel>();
- k->configure(src1, src2, dst);
- _kernel = std::move(k);
-}
-
-void CpuComplexMul::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuMul.h b/src/runtime/cpu/operators/CpuMul.h
deleted file mode 100644
index da518c4461..0000000000
--- a/src/runtime/cpu/operators/CpuMul.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_MUL_H
-#define ARM_COMPUTE_CPU_MUL_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuMulKernel */
-class CpuMul : public ICpuOperator
-{
-public:
-    /** Initialise the kernel's inputs, dst and conversion policy.
- *
- * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
- * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- *
- * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32).
- * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst dst tensor info. Data types supported:
- * - U8, only if both inputs are U8.
- * - QASYMM8, only if both inputs are QASYMM8.
- * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED.
- * - S16.
- * - QSYMM16, only if both inputs are QSYMM16.
- * - S32, only if both inputs are S32 or both are QSYMM16.
- * - F16, only if @p src1 is F16.
- * - F32, only if both inputs are F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
- * @param[in] rounding_policy Rounding policy.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuMul::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-
-/** Basic function to run @ref kernels::CpuComplexMulKernel */
-class CpuComplexMul : public ICpuOperator
-{
-public:
-    /** Initialise the kernel's inputs and dst.
- *
- * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuComplexMul::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_MUL_H */ \ No newline at end of file
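
A hedged usage sketch for the multiplication operator above, showing the stateless configure-on-infos / run-on-pack split. The ACL_SRC_0/ACL_SRC_1/ACL_DST slot names follow the convention visible elsewhere in this diff and are assumed to be what the kernel expects; the runtime Tensor and allocator calls are standard Compute Library usage, not part of the patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuMul.h"

using namespace arm_compute;

void cpu_mul_sketch()
{
    TensorInfo info(TensorShape(32U, 8U), 1, DataType::F32);

    // Scale 1.0 with round-to-zero, as required for scales other than 1/255 (see the note above)
    cpu::CpuMul mul;
    mul.configure(&info, &info, &info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    Tensor a, b, out;
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    // The operator holds no tensor pointers; sources and destination travel in the pack
    ITensorPack pack = { { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_DST, &out } };
    mul.run(pack);
}
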
diff --git a/src/runtime/cpu/operators/CpuPRelu.h b/src/runtime/cpu/operators/CpuPRelu.h
deleted file mode 100644
index a6859f95d9..0000000000
--- a/src/runtime/cpu/operators/CpuPRelu.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_PRELU_H
-#define ARM_COMPUTE_CPU_PRELU_H
-
-#include "src/runtime/cpu/operators/CpuElementwise.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for PRelu operation */
-using CpuPRelu = CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_PRELU_H */ \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuPermute.cpp b/src/runtime/cpu/operators/CpuPermute.cpp
deleted file mode 100644
index 7fde1e3767..0000000000
--- a/src/runtime/cpu/operators/CpuPermute.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuPermute.h"
-
-#include "src/core/cpu/kernels/CpuPermuteKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
-{
- auto k = std::make_unique<kernels::CpuPermuteKernel>();
- k->configure(src, dst, perm);
- _kernel = std::move(k);
-}
-
-Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- return kernels::CpuPermuteKernel::validate(src, dst, perm);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPermute.h b/src/runtime/cpu/operators/CpuPermute.h
deleted file mode 100644
index 2500017c0e..0000000000
--- a/src/runtime/cpu/operators/CpuPermute.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_PERMUTE_H
-#define ARM_COMPUTE_CPU_PERMUTE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuPermuteKernel */
-class CpuPermute : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] src Source tensor to permute. Data types supported: All
-     * @param[out] dst  Destination tensor. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuPermute::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_PERMUTE_H */
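
A short sketch of the permute operator above, used here to reorder an NCHW-shaped tensor into NHWC. The PermutationVector value and the resulting destination shape follow the library's lowest-dimension-first convention as the editor understands it, so treat them as assumptions rather than part of the patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuPermute.h"

using namespace arm_compute;

void cpu_permute_sketch()
{
    // NCHW data is stored with dimensions (W, H, C); NHWC with (C, W, H)
    TensorInfo src_info(TensorShape(16U, 16U, 3U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(3U, 16U, 16U), 1, DataType::F32);
    const PermutationVector perm(2U, 0U, 1U); // commonly used for NCHW -> NHWC (assumed)

    if(bool(cpu::CpuPermute::validate(&src_info, &dst_info, perm)))
    {
        cpu::CpuPermute permute;
        permute.configure(&src_info, &dst_info, perm);

        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack = { { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
        permute.run(pack);
    }
}
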
diff --git a/src/runtime/cpu/operators/CpuPool2d.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp
deleted file mode 100644
index e746c8fb3b..0000000000
--- a/src/runtime/cpu/operators/CpuPool2d.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuPool2d.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/cpu/kernels/CpuPool2dKernel.h"
-#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
-
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace cpu
-{
-CpuPool2d::CpuPool2d()
- : _pooling_layer_kernel(),
- _border_handler(),
- _asm_glue(),
- _is_global_pooling_layer(false),
- _data_layout(DataLayout::NCHW),
- _aux_mem(1)
-{
-}
-
-CpuPool2d::~CpuPool2d() = default;
-
-void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
-{
- // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
-
- // Get data layout
- _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
-
- // Check if we have Global Pooling Layer
- const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height);
-
- if(run_optimised)
- {
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
-
- auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
- ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
- pooling_wrapper->configure(src, dst, pool_info, ci);
-
- // Get kernel's memory requirements
- constexpr size_t alignment = 4096;
- const size_t workspace_size = pooling_wrapper->get_working_size(num_threads);
- _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
-
- _asm_glue = std::move(pooling_wrapper);
- }
- else
- {
- // Configure pooling kernel
- auto k = std::make_unique<kernels::CpuPool2dKernel>();
- k->configure(src, dst, pool_info, indices);
- _pooling_layer_kernel = std::move(k);
-
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- {
- // Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f);
- if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding)
- {
- zero_value = PixelValue(0, src->data_type(), src->quantization_info());
- }
- auto b = std::make_unique<NEFillBorderKernel>();
- b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value);
- _border_handler = std::move(b);
- break;
- }
- case DataLayout::NHWC:
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- }
-}
-
-Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
-
- if(run_optimised)
- {
- return Status{};
- }
-
- return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices);
-}
-
-void CpuPool2d::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
-
- if(_asm_glue)
- {
- const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY;
- NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors);
- }
- else
- {
- switch(_data_layout)
- {
- case DataLayout::NCHW:
- // Fill border
- NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors);
-
- // Run pooling layer
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
- break;
- case DataLayout::NHWC:
- // Run pooling layer
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors);
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- }
-}
-
-experimental::MemoryRequirements CpuPool2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuPool2d.h b/src/runtime/cpu/operators/CpuPool2d.h
deleted file mode 100644
index 7feff91612..0000000000
--- a/src/runtime/cpu/operators/CpuPool2d.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_POOL2D_H
-#define ARM_COMPUTE_CPU_POOL2D_H
-
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward Declarations
-struct PoolingLayerInfo;
-
-namespace cpu
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
- *
- * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::CpuPool2dKernel
- * -# @ref kernels::CpuPool2dAssemblyWrapperKernel
- */
-class CpuPool2d : public ICpuOperator
-{
-public:
- CpuPool2d();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d);
- ~CpuPool2d();
- /** Set the src and dst tensors.
- *
- * @note F16 is supported for pool sizes 2 and 3 only
- *
- * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[out]     indices   (Optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuPool2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- std::unique_ptr<INEKernel> _pooling_layer_kernel;
- std::unique_ptr<INEKernel> _border_handler;
- std::unique_ptr<INEKernel> _asm_glue;
-
- bool _is_global_pooling_layer;
- DataLayout _data_layout;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOL2D_H */
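
A hedged sketch of the pooling operator above: a 2x2, stride-2 max pool in NHWC. The PoolingLayerInfo constructor arguments and the MemoryInfo field meanings are assumptions from the editor's reading of the library headers; the workspace() contract itself (one temporary buffer when the assembly path is selected) is what the deleted sources above implement.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/cpu/operators/CpuPool2d.h"

using namespace arm_compute;

void cpu_pool2d_sketch()
{
    // NHWC: dimensions are (C, W, H)
    TensorInfo src(TensorShape(32U, 16U, 16U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 8U, 8U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    // Constructor signature assumed: (pool type, pool size, data layout, pad/stride)
    const PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));

    if(bool(cpu::CpuPool2d::validate(&src, &dst, pool_info)))
    {
        cpu::CpuPool2d pool;
        pool.configure(&src, &dst, pool_info);

        // Callers are expected to provide the reported scratch memory through the tensor pack
        for(const auto &mem : pool.workspace())
        {
            (void)mem; // describes one temporary buffer (size/alignment field names assumed)
        }
    }
}
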
diff --git a/src/runtime/cpu/operators/CpuQuantize.cpp b/src/runtime/cpu/operators/CpuQuantize.cpp
deleted file mode 100644
index 5af7f6343b..0000000000
--- a/src/runtime/cpu/operators/CpuQuantize.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/runtime/cpu/operators/CpuQuantize.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuQuantizeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst));
- return Status{};
-}
-
-void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Configure quantize kernel
- auto k = std::make_unique<kernels::CpuQuantizeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-void CpuQuantize::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuQuantize.h b/src/runtime/cpu/operators/CpuQuantize.h
deleted file mode 100644
index 9a34a36bcc..0000000000
--- a/src/runtime/cpu/operators/CpuQuantize.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_QUANTIZE_H
-#define ARM_COMPUTE_CPU_QUANTIZE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuQuantizeKernel that quantizes an input tensor */
-class CpuQuantize : public ICpuOperator
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuQuantize::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */
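
A brief sketch of the quantization operator above, converting F32 data to QASYMM8 with an explicit scale and zero point on the destination info. QuantizationInfo and the runtime Tensor calls are standard Compute Library API; the single-input pack slots mirror the convention used throughout this diff and are otherwise an assumption.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuQuantize.h"

using namespace arm_compute;

void cpu_quantize_sketch()
{
    TensorInfo src_info(TensorShape(128U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U), 1, DataType::QASYMM8);
    dst_info.set_quantization_info(QuantizationInfo(0.05f, 128)); // scale, zero point (illustrative)

    cpu::CpuQuantize quantize;
    quantize.configure(&src_info, &dst_info);

    Tensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack = { { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
    quantize.run(pack);
}
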
diff --git a/src/runtime/cpu/operators/CpuReshape.cpp b/src/runtime/cpu/operators/CpuReshape.cpp
deleted file mode 100644
index 33c9cb87b6..0000000000
--- a/src/runtime/cpu/operators/CpuReshape.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuReshape.h"
-
-#include "src/core/cpu/kernels/CpuReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuReshape::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuReshapeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuReshapeKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuReshape.h b/src/runtime/cpu/operators/CpuReshape.h
deleted file mode 100644
index 581b55e0ef..0000000000
--- a/src/runtime/cpu/operators/CpuReshape.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_RESHAPE_H
-#define ARM_COMPUTE_CPU_RESHAPE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuReshapeKernel */
-class CpuReshape : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor info. Data type supported: All
-     * @param[out] dst Destination tensor info. Data type supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuReshape::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_RESHAPE_H */
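
A small sketch illustrating the design point behind the stateless reshape operator above: configure() only captures tensor metadata, so one configured instance can be reused for any tensors whose infos match. The runtime Tensor handling is standard library usage and not part of the patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuReshape.h"

using namespace arm_compute;

void cpu_reshape_sketch()
{
    TensorInfo src_info(TensorShape(8U, 8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(64U), 1, DataType::F32);

    cpu::CpuReshape reshape;
    reshape.configure(&src_info, &dst_info);

    Tensor first_src, first_dst, second_src, second_dst;
    first_src.allocator()->init(src_info);
    first_dst.allocator()->init(dst_info);
    second_src.allocator()->init(src_info);
    second_dst.allocator()->init(dst_info);
    first_src.allocator()->allocate();
    first_dst.allocator()->allocate();
    second_src.allocator()->allocate();
    second_dst.allocator()->allocate();

    ITensorPack first  = { { TensorType::ACL_SRC, &first_src }, { TensorType::ACL_DST, &first_dst } };
    ITensorPack second = { { TensorType::ACL_SRC, &second_src }, { TensorType::ACL_DST, &second_dst } };
    reshape.run(first);
    reshape.run(second); // same operator instance, different tensors, no reconfigure needed
}
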
diff --git a/src/runtime/cpu/operators/CpuScale.cpp b/src/runtime/cpu/operators/CpuScale.cpp
deleted file mode 100644
index 475cb2d4e8..0000000000
--- a/src/runtime/cpu/operators/CpuScale.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuScale.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuScaleKernel.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
-{
- ARM_COMPUTE_ERROR_ON(offsets == nullptr);
- float sampling_offset = 0.0f;
- if(sampling_policy == SamplingPolicy::CENTER)
- {
- sampling_offset = 0.5f;
- }
-
- Window win;
- win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
- win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
-
- if(dx != nullptr && dy != nullptr)
- {
- // Pre-compute the offset and pixel's distance for BILINEAR interpolation
- Iterator offsets_it(offsets, win);
- Iterator dx_it(dx, win);
- Iterator dy_it(dy, win);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
- const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
- const int in_xi = std::floor(in_x);
- const int in_yi = std::floor(in_y);
-
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
- *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
- },
- offsets_it, dx_it, dy_it);
- }
- else
- {
- // Pre-compute the offset for NEAREST interpolation
- Iterator offsets_it(offsets, win);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float float_in_xi = (id.x() + sampling_offset) * wr;
- const auto in_xi = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- },
- offsets_it);
- }
-}
-} // namespace
-
-void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuScale::validate(src, dst, info));
-
- _scale_info = info;
- _is_prepared = false;
-
- // Get data layout and width/height indices
- _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
-
- // Get the tensor shape
- TensorShape shape(dst->dimension(idx_width));
- shape.set(1, dst->dimension(idx_height), false);
-
- TensorInfo tensor_info_offsets(shape, Format::S32);
- TensorInfo tensor_info_dxdy(shape, Format::F32);
-
- auto dx = std::make_unique<TensorInfo>(tensor_info_dxdy);
- auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy);
- auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets);
- auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>();
- switch(policy_to_use)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- scale_kernel->configure(src, nullptr, nullptr, offsets.get(), dst, info);
- break;
- }
- case InterpolationPolicy::BILINEAR:
- {
- scale_kernel->configure(src, dx.get(), dy.get(), offsets.get(), dst, info);
- break;
- }
- case InterpolationPolicy::AREA:
- {
- scale_kernel->configure(src, nullptr, nullptr, nullptr, dst, info);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported interpolation mode");
- }
- _kernel = std::move(scale_kernel);
-}
-
-Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
-
- ITensorInfo *offsets = nullptr;
- ITensorInfo *dx = nullptr;
- ITensorInfo *dy = nullptr;
-
- // Get data layout and width/height indices
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
-
-    // Get the tensor shape of auxiliary buffers
- const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
- TensorInfo tensor_info_offsets(shape, Format::S32);
- TensorInfo tensor_info_dx(shape, Format::F32);
- TensorInfo tensor_info_dy(shape, Format::F32);
- switch(policy_to_use)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- offsets = &tensor_info_offsets;
- break;
- case InterpolationPolicy::BILINEAR:
- offsets = &tensor_info_offsets;
- dx = &tensor_info_dx;
- dy = &tensor_info_dy;
- break;
- default:
- break;
- }
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
- return Status{};
-}
-
-void CpuScale::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- _is_prepared = true;
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- auto dx = tensors.get_tensor(TensorType::ACL_INT_0);
- auto dy = tensors.get_tensor(TensorType::ACL_INT_1);
- auto offsets = tensors.get_tensor(TensorType::ACL_INT_2);
-
- // Get data layout and width/height indices
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
- const SamplingPolicy sampling_policy = _scale_info.sampling_policy;
-
- switch(policy_to_use)
- {
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- // Pre-compute offsets for nearest interpolation
- precompute_dx_dy_offsets(nullptr, nullptr, offsets, wr, hr, sampling_policy, is_align_corners_used);
- break;
- }
- case InterpolationPolicy::BILINEAR:
- {
- // Pre-compute dx, dy and offsets for bilinear interpolation
- precompute_dx_dy_offsets(dx, dy, offsets, wr, hr, sampling_policy, is_align_corners_used);
- break;
- }
- case InterpolationPolicy::AREA:
- {
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported interpolation mode");
- }
- }
-}
-
-void CpuScale::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- prepare(tensors);
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuScale.h b/src/runtime/cpu/operators/CpuScale.h
deleted file mode 100644
index b83e04bc42..0000000000
--- a/src/runtime/cpu/operators/CpuScale.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SCALE_H
-#define ARM_COMPUTE_CPU_SCALE_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to compute Scale */
-class CpuScale : public ICpuOperator
-{
-public:
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo to be used for configuration
- */
- void configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuScale::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
-
- // Inherited methods overridden:
- void prepare(ITensorPack &tensors) override;
- void run(ITensorPack &tensors) override;
-
-private:
- ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- bool _is_prepared{ false };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SCALE_H */
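
A hedged configure/validate sketch for the scale operator above, doubling a 2D F32 plane with bilinear interpolation. The two-argument ScaleKernelInfo construction mirrors the default member initialiser in the deleted header; note that at run() time the pack must also carry the offsets (and, for bilinear, dx/dy) auxiliary tensors in the ACL_INT_0..2 slots that prepare() fills, which this fragment does not show.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/runtime/cpu/operators/CpuScale.h"

using namespace arm_compute;

void cpu_scale_sketch()
{
    TensorInfo src(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 32U), 1, DataType::F32);

    // Interpolation and border mode chosen for illustration; other combinations may apply
    const ScaleKernelInfo info{ InterpolationPolicy::BILINEAR, BorderMode::REPLICATE };

    if(bool(cpu::CpuScale::validate(&src, &dst, info)))
    {
        cpu::CpuScale scale;
        scale.configure(&src, &dst, info);
    }
}
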
diff --git a/src/runtime/cpu/operators/CpuSoftmax.cpp b/src/runtime/cpu/operators/CpuSoftmax.cpp
deleted file mode 100644
index abbc539b19..0000000000
--- a/src/runtime/cpu/operators/CpuSoftmax.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuSoftmax.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <bool IS_LOG>
-CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric()
- : _permute_input(),
- _permute_output(),
- _max_kernel(),
- _softmax_kernel(),
- _max(),
- _tmp(),
- _input_permuted(),
- _output_permuted(),
- _needs_permute(false),
- _aux_mem(InternalTensorIdx::COUNT)
-{
-}
-
-template <bool IS_LOG>
-void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis)
-{
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
-
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
-
- _needs_permute = actual_axis > 0;
-
- if(_needs_permute)
- {
- _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
- }
-
- // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
- // or it is the original input case (2D case)
- const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src);
-
- // Create intermediate tensors shapes
- TensorShape max_sum_shape = tmp_input->tensor_shape();
- max_sum_shape.set(0, 1);
- const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
- TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
-
- // Init intermediate tensors
- _max = TensorInfo(max_info);
- _tmp = TensorInfo(tensor_info_tmp);
-
- // Configure kernels
- auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>();
- mk->configure(tmp_input, &_max);
- _max_kernel = std::move(mk);
-
- auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
- if(_needs_permute)
- {
- // The normalization kernel stores the result in a permuted output tensor
- sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
-
- // Re-permute the permuted output into the requested (4D) output
- _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
- }
- else
- {
- // Softmax 2D case
- sm->configure(tmp_input, &_max, dst, beta, &_tmp);
- }
- _softmax_kernel = std::move(sm);
-
- _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
- _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
-
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size());
-}
-
-template <bool IS_LOG>
-Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis)
-{
- // Perform validation step
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
- ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis);
-
- // Create intermediate tensor info
- DataType tmp_data_type = src->data_type();
- const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
-
- TensorShape max_sum_shape = src->tensor_shape();
- max_sum_shape.set(0, 1);
- const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true));
- const TensorInfo dont_care;
-
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
-
- const bool needs_permute = actual_axis > 0;
-
- if(needs_permute)
- {
- const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
- TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector));
- TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
- }
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
-
- return Status{};
-}
-
-template <bool IS_LOG>
-void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true);
- CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true);
-
- CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true);
- CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true);
-
- ITensorPack max_pack;
- ITensorPack softmax_pack;
-
- if(_needs_permute)
- {
- ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } };
- _permute_input.run(permute_in_pack);
-
- max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } };
-
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, input_permuted.get() },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, output_permuted.get() },
- { TensorType::ACL_DST_1, tmp.get() }
- };
- }
- else
- {
- max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } };
-
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, src },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, dst },
- { TensorType::ACL_DST_1, tmp.get() }
- };
- }
-
- NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
- NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
-
- if(_needs_permute)
- {
- ITensorPack permute_out_pack;
- permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get());
- permute_out_pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output.run(permute_out_pack);
- }
-}
-
-template <bool IS_LOG>
-experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
-{
- return _aux_mem;
-}
-
-template class CpuSoftmaxGeneric<false>;
-template class CpuSoftmaxGeneric<true>;
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuSoftmax.h b/src/runtime/cpu/operators/CpuSoftmax.h
deleted file mode 100644
index a9ac803c09..0000000000
--- a/src/runtime/cpu/operators/CpuSoftmax.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
-#define ARM_COMPUTE_CPU_SOFTMAX_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuLogits1DMaxKernel;
-template <bool IS_LOG>
-class CpuLogits1DSoftmaxKernel;
-
-/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
- *
- * Softmax is calculated by :
- * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
- *
- * Log Softmax is calculated by :
- * @f[ out = ((x - max(x)) * beta) - log(\sum{e^{(x - max(x)) * beta}}) @f]
- *
- * This function runs the following function/kernels:
- * -# If axis is not 0:
- * -# @ref CpuPermute
- * -# @ref kernels::CpuLogits1DMaxKernel
- * -# @ref kernels::CpuLogits1DSoftmaxKernel
- */
-template <bool IS_LOG = false>
-class CpuSoftmaxGeneric : public ICpuOperator
-{
-public:
- CpuSoftmaxGeneric();
- /** Set the input and output tensors.
- *
- * @param[in,out] src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out]    dst  Destination tensor info. Data types supported: same as @p src.
- * @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
- * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuSoftmaxGeneric::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum InternalTensorIdx
- {
- MAX = 0,
- TMP,
- PERMUTED_SRC,
- PERMUTED_DST,
- COUNT
- };
-
- CpuPermute _permute_input;
- CpuPermute _permute_output;
- std::unique_ptr<ICpuKernel> _max_kernel;
- std::unique_ptr<ICpuKernel> _softmax_kernel;
-
- TensorInfo _max;
- TensorInfo _tmp;
- TensorInfo _input_permuted;
- TensorInfo _output_permuted;
-
- bool _needs_permute;
- experimental::MemoryRequirements _aux_mem{};
-};
-using CpuSoftmax = CpuSoftmaxGeneric<false>;
-using CpuLogSoftmax = CpuSoftmaxGeneric<true>;
-
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */
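Note: the header removed above documents the softmax and log-softmax equations but gives no worked example. The standalone sketch below (plain C++, not library code; the input values and beta are arbitrary) evaluates both equations on a small vector; subtracting max(x) only improves numerical stability and does not change the result.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<float> x{ 1.0f, 2.0f, 3.0f };
        const float beta  = 1.0f;
        const float x_max = *std::max_element(x.begin(), x.end());

        std::vector<float> e(x.size());
        float sum = 0.0f;
        for(size_t i = 0; i < x.size(); ++i)
        {
            e[i] = std::exp((x[i] - x_max) * beta); // exp((x - max(x)) * beta)
            sum += e[i];
        }
        for(size_t i = 0; i < x.size(); ++i)
        {
            const float softmax     = e[i] / sum;
            const float log_softmax = (x[i] - x_max) * beta - std::log(sum);
            printf("softmax=%f log_softmax=%f\n", softmax, log_softmax);
        }
        return 0;
    }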
diff --git a/src/runtime/cpu/operators/CpuSub.cpp b/src/runtime/cpu/operators/CpuSub.cpp
deleted file mode 100644
index 9baaaa9d67..0000000000
--- a/src/runtime/cpu/operators/CpuSub.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuSub.h"
-
-#include "src/core/cpu/kernels/CpuSubKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<kernels::CpuSubKernel>();
- k->configure(src0, src1, dst, policy);
- _kernel = std::move(k);
-}
-
-Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuSub.h b/src/runtime/cpu/operators/CpuSub.h
deleted file mode 100644
index 07f5be89cd..0000000000
--- a/src/runtime/cpu/operators/CpuSub.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SUB_H
-#define ARM_COMPUTE_CPU_SUB_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuSubKernel */
-class CpuSub : public ICpuOperator
-{
-public:
- /** Initialise the kernel's inputs, dst and conversion policy.
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[out] dst Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuSub::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SUB_H */ \ No newline at end of file
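Note: the interface removed above rejects ConvertPolicy::WRAP for quantized inputs. The standalone sketch below (plain C++, not the library kernel; the values are arbitrary) shows why: with 8-bit data, wrap-around turns a small negative difference into a large positive code, while saturation clamps it to the representable range.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    uint8_t sub_wrap(uint8_t a, uint8_t b)
    {
        return static_cast<uint8_t>(a - b); // modular arithmetic: 3 - 10 wraps to 249
    }

    uint8_t sub_saturate(uint8_t a, uint8_t b)
    {
        const int r = static_cast<int>(a) - static_cast<int>(b);
        return static_cast<uint8_t>(std::clamp(r, 0, 255)); // clamp to [0, 255]
    }

    int main()
    {
        printf("wrap: %d, saturate: %d\n", sub_wrap(3, 10), sub_saturate(3, 10)); // 249 vs 0
        return 0;
    }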
diff --git a/src/runtime/cpu/operators/CpuTranspose.cpp b/src/runtime/cpu/operators/CpuTranspose.cpp
deleted file mode 100644
index 51eeb90b8b..0000000000
--- a/src/runtime/cpu/operators/CpuTranspose.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuTranspose.h"
-
-#include "src/core/cpu/kernels/CpuTransposeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuTranspose::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::CpuTransposeKernel>();
- k->configure(src, dst);
- _kernel = std::move(k);
-}
-
-Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::CpuTransposeKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuTranspose.h b/src/runtime/cpu/operators/CpuTranspose.h
deleted file mode 100644
index 0735924839..0000000000
--- a/src/runtime/cpu/operators/CpuTranspose.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_TRANSPOSE_H
-#define ARM_COMPUTE_CPU_TRANSPOSE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuTransposeKernel */
-class CpuTranspose : public ICpuOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[out] dst Destination tensor. Data types supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuTranspose::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_TRANSPOSE_H */
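Note: the operator removed above simply wraps kernels::CpuTransposeKernel, which exchanges the first two dimensions of the tensor. A minimal standalone illustration of that mapping (plain C++, with made-up sizes, not the kernel implementation):

    #include <cstdio>

    int main()
    {
        const int W = 3, H = 2;
        const float src[H][W] = { { 1, 2, 3 }, { 4, 5, 6 } };
        float       dst[W][H];
        for(int y = 0; y < H; ++y)
        {
            for(int x = 0; x < W; ++x)
            {
                dst[x][y] = src[y][x]; // element (x, y) of src becomes (y, x) of dst
            }
        }
        printf("dst[2][1] = %g\n", dst[2][1]); // 6
        return 0;
    }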
diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
deleted file mode 100644
index 253280a951..0000000000
--- a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp
+++ /dev/null
@@ -1,839 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
-#include "src/core/cpu/kernels/CpuWinogradConv2dKernel.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-#include "support/Cast.h"
-
-#include <set>
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::experimental;
-using namespace arm_compute::utils::cast;
-
-namespace
-{
-arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
-{
- switch(act_info.activation())
- {
- case ActivationLayerInfo::ActivationFunction::RELU:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
- }
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
- }
- default:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::None);
- }
- }
-}
-
-inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-
- if(src->data_type() == DataType::F32)
- {
- if(input_dims.width > 4 && input_dims.height > 4)
- {
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 4, 4, 3, 3>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 2, 3, 3>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
- }
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(src->data_type() == DataType::F16)
- {
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
- if(act_info.enabled())
- {
- CpuActivation::validate(dst, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_5x5(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 2, 5, 5>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, dst, winograd_info)));
- if(act_info.enabled())
- {
- CpuActivation::validate(dst, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_3x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 6, 1, 3>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
- if(act_info.enabled())
- {
- CpuActivation::validate(dst, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_1x3(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 6, 1, 3, 1>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
-
- if(act_info.enabled())
- {
- CpuActivation::validate(dst, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_5x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 4, 1, 5>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, dst, winograd_info)));
- if(act_info.enabled())
- {
- CpuActivation::validate(dst, nullptr, act_info);
- }
- return Status{};
-}
-inline Status validate_kernel_1x5(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 4, 1, 5, 1>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
- if(act_info.enabled())
- {
- CpuActivation::validate(dst, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_7x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 2, 1, 7>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, dst, winograd_info)));
- if(act_info.enabled())
- {
- CpuActivation::validate(dst, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_1x7(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 1, 7, 1>::validate(src, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
-
- if(act_info.enabled())
- {
- CpuActivation::validate(dst, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Tensor4DShape internal_get_input_shape(const ITensorInfo *src)
-{
- const DataLayout data_layout = src->data_layout();
- const int in_width = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
- const int in_height = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
- const int in_channels = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- const int in_batches = src->dimension(3);
-
- return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_UNUSED(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
- return ICpuWinogradConv2dTransformWeightsKernel::validate(src, weights);
-}
-Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type)
-{
- Size2D output_tile = Size2D{};
- if(kernel_dims == Size2D(3U, 3U))
- {
- output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
- if(data_type == DataType::F16)
- {
- output_tile = Size2D(4U, 4U);
- }
- }
- else if(kernel_dims == Size2D(5U, 5U))
- {
- output_tile = Size2D(2U, 2U);
- }
- else if(kernel_dims == Size2D(1U, 3U))
- {
- output_tile = Size2D(1U, 6U);
- }
- else if(kernel_dims == Size2D(3U, 1U))
- {
- output_tile = Size2D(6U, 1U);
- }
- else if(kernel_dims == Size2D(1U, 5U))
- {
- output_tile = Size2D(1U, 4U);
- }
- else if(kernel_dims == Size2D(5U, 1U))
- {
- output_tile = Size2D(4U, 1U);
- }
- else if(kernel_dims == Size2D(7U, 1U))
- {
- output_tile = Size2D(2U, 1U);
- }
- else if(kernel_dims == Size2D(1U, 7U))
- {
- output_tile = Size2D(1U, 2U);
- }
- return output_tile;
-}
-
-bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type)
-{
- // Check if we want to configure a Winograd configuration which requires fast math
- using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- const std::vector<WinogradConfiguration> fast_math_winograd_f16 =
- {
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3))
- };
-
- const std::vector<WinogradConfiguration> fast_math_winograd_f32 =
- {
- WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
- };
-
- auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
- std::pair<int, int>(kernel_size.width, kernel_size.height));
-
- switch(data_type)
- {
- case DataType::F16:
- return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end();
- case DataType::F32:
- return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end();
- default:
- return false;
- }
-}
-
-inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
-{
- return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
-}
-
-} // namespace
-
-CpuWinogradConv2d::CpuWinogradConv2d()
- : _gemm_function(std::make_unique<CpuGemm>()),
- _activation_func(std::make_unique<CpuActivation>()),
- _permute_input(std::make_unique<CpuPermute>()),
- _permute_output(std::make_unique<CpuPermute>()),
- _permute_weights(std::make_unique<CpuPermute>()),
- _transform_input_kernel(nullptr),
- _transform_weights_kernel(nullptr),
- _transform_output_kernel(nullptr),
- _data_layout(),
- _aux_mem(AuxTensorIdx::Count),
- _input_nhwc(),
- _output_nhwc(),
- _input_workspace(),
- _kernel_storage(),
- _output_workspace(),
- _input_transformed(),
- _output_transformed(),
- _weights_hwio(),
- _run_activation(false),
- _is_prepared(false)
-{
-}
-
-CpuWinogradConv2d::~CpuWinogradConv2d() = default;
-
-void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info));
-
- // Get indices for the width and height
- _data_layout = src->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-
- const Size2D input_dims = Size2D(src->dimension(width_idx), src->dimension(height_idx));
- const Size2D kernel_size = Size2D(weights->dimension(width_idx), weights->dimension(height_idx));
- const DataType data_type = src->data_type();
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
- ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
- "This Winograd configuration requires enable_fast_math=true");
- }
-
- _is_prepared = false;
-
- std::unique_ptr<ICpuWinogradConv2dTransformInputKernel> transform_input_kernel;
- std::unique_ptr<ICpuWinogradConv2dTransformWeightsKernel> transform_weights_kernel;
- std::unique_ptr<ICpuWinogradConv2dTransformOutputKernel> transform_output_kernel;
-
- int n_gemms = 1;
- int N_BLOCK = 1; // Size of block used by GEMM.
- if(data_type == DataType::F32)
- {
- if(kernel_size == Size2D(3, 3))
- {
- if(src->dimension(width_idx) > 4 && src->dimension(height_idx) > 4)
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 4, 4, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 2, 2, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- }
- else if(kernel_size == Size2D(5, 5))
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 2, 2, 5, 5>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 3))
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 6, 1, 3, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(3, 1))
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 1, 6, 1, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 5))
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 4, 1, 5, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(5, 1))
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 1, 4, 1, 5>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 7))
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 2, 1, 7, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(7, 1))
- {
- using config = CpuWinogradConv2dConfiguration<float, float, 1, 2, 1, 7>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(data_type == DataType::F16)
- {
- if(kernel_size == Size2D(3, 3))
- {
- using config = CpuWinogradConv2dConfiguration<__fp16, __fp16, 4, 4, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
-
- const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID;
- const bool use_same_padding = use_padding_type == PADDING_SAME;
-
- // Get convolved dimensions
- const int in_channels = src->dimension(channel_idx);
- const int out_channels = dst->dimension(channel_idx);
-
- const Tensor4DShape in_shape(internal_get_input_shape(src));
- const size_t data_type_size = src->element_size();
- // Get the memory required to instantiate a new Winograd operator.
- constexpr size_t storage_alignment = 64;
-
- // Kernel Storage
- const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
-
- // Input storage
- const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
-
- // Output storage
- const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
- const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
- const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
- const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
- const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
-
- // Configure GEMM
- const int tile_rows = iceildiv(output_shape.first, output_tile.height);
- const int tile_cols = iceildiv(output_shape.second, output_tile.width);
- const int m = in_shape.n_batches * tile_rows * tile_cols;
- const int k = in_shape.n_channels;
- const int n = out_channels;
- const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
- const int output_matrix_row_stride = kernel_matrix_row_stride;
-
- TensorShape a_shape(k, m, 1, n_gemms);
- Strides a_strides(data_type_size);
- a_strides.set(1, a_strides[0] * k);
- //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
- a_strides.set(2, 0);
- a_strides.set(3, data_type_size * input_matrix_stride);
-
- TensorShape b_shape(n, k, n_gemms);
- Strides b_strides(data_type_size);
- b_strides.set(1, data_type_size * kernel_matrix_row_stride);
- b_strides.set(2, data_type_size * kernel_matrix_stride);
-
- TensorShape d_shape(n, m, 1, n_gemms);
- Strides d_strides(data_type_size);
- d_strides.set(1, data_type_size * output_matrix_row_stride);
- //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
- d_strides.set(2, 0);
- d_strides.set(3, data_type_size * output_matrix_stride);
-
- TensorInfo a_info{};
- TensorInfo b_info{};
- TensorInfo d_info{};
- a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
- b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
- d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
-
- _input_transformed = a_info;
- _kernel_storage = b_info;
- _output_transformed = d_info;
-
- const ITensorInfo *input_to_use = src;
- ITensorInfo *output_to_use = dst;
- PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
- const unsigned int max_num_threads = NEScheduler::get().num_threads();
-
- // Configure the kernel to transform the input tensor from NCHW -> NHWC
- if(_data_layout == DataLayout::NCHW)
- {
- _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- input_to_use = &_input_nhwc;
- weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
- }
-
- // Configure input transform kernel
- transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- &_input_transformed, input_matrix_stride, &_input_workspace);
- const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
- TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
- _input_workspace = input_workspace_info;
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
- transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
- // Configure GEMM function
- _gemm_function->configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
-
- // Configure output transform function
- // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
- if(_data_layout == DataLayout::NCHW)
- {
- // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
- dst->dimension(1), dst->dimension(3)),
- 1, dst->data_type());
- _output_nhwc = info;
- output_to_use = &_output_nhwc;
- }
- const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
-
- transform_output_kernel->configure(biases,
- &_output_transformed,
- output_matrix_stride,
- output_to_use,
- in_shape.n_batches,
- output_shape.first,
- output_shape.second,
- out_channels,
- &_output_workspace,
- activation);
-
- const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
- TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
- _output_workspace = output_workspace_info;
-
- // Reorder the convoluted output to ACL's ordering NCHW
- if(_data_layout == DataLayout::NCHW)
- {
- _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U));
- }
-
- _transform_input_kernel = std::move(transform_input_kernel);
- _transform_weights_kernel = std::move(transform_weights_kernel);
- _transform_output_kernel = std::move(transform_output_kernel);
-
- //Configure Activation Layer
- _run_activation = act_info.enabled() && !fuse_function_supported(act_info);
- if(_run_activation)
- {
- _activation_func->configure(dst, nullptr, act_info);
- }
-
- auto asm_mem_req = _gemm_function->workspace();
- _aux_mem[GemmWorkspace] = asm_mem_req[GemmWorkspace];
- _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
- _aux_mem[InterleavedLHS] = asm_mem_req[InterleavedLHS];
- _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS];
- _aux_mem[TempResult] = asm_mem_req[TempResult];
-
- // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
- _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, input_storage_size, storage_alignment);
- _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, output_storage_size, storage_alignment);
- _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size));
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
- _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment);
- if(_data_layout == DataLayout::NCHW)
- {
- _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
- _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
- }
-}
-
-Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
-
- // Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
-
- // Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(src->dimension(idx_width), src->dimension(idx_height));
- const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
- const DataType data_type = src->data_type();
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
- "This Winograd configuration requires enable_fast_math=true");
- }
-
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- src->data_layout());
-
- // Validate input transform
- const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
- const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
- // Validate filter transform
- const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
- const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
- // Validate batched matrix multiply
- TensorShape batched_mm_output_shape = input0.tensor_shape();
- batched_mm_output_shape[0] = input1.tensor_shape()[0];
- const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-
- if(kernel_size == Size2D(3, 3))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- return validate_kernel_3x3(input_dims, src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(5, 5))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- return validate_kernel_5x5(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
- }
- if(kernel_size == Size2D(3, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_3x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 3))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x3(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(5, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_5x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 5))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x5(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(7, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_7x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 7))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x7(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
- }
-}
-
-void CpuWinogradConv2d::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- auto a = tensors.get_const_tensor(ACL_SRC_0);
- auto c = tensors.get_const_tensor(ACL_SRC_2);
- auto d = tensors.get_tensor(ACL_DST);
-
- CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
- CpuAuxTensorHandler input_transformed(offset_int_vec(TransformedInput), _input_transformed, tensors, true);
- CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
-
- const bool is_nchw = _data_layout == DataLayout::NCHW;
- if(is_nchw)
- {
- //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- ITensorPack pack{ { ACL_SRC, a }, { ACL_DST, input_nhwc.get() } };
- _permute_input->run(pack);
- }
-
- // Transform input tensor to the winograd domain
- ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : a }, { ACL_DST, input_transformed.get() }, { ACL_INT, input_workspace.get() } };
- NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, _transform_input_kernel->window(), transform_input_pack);
-
- CpuAuxTensorHandler output_transformed(offset_int_vec(TransformedOutput), _output_transformed, tensors, true);
- CpuAuxTensorHandler weights_transformed(offset_int_vec(TransformedWeights), _kernel_storage, tensors, true);
-
- // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- ITensorPack gemm_pack = tensors;
- gemm_pack.add_const_tensor(ACL_SRC, input_transformed.get());
- gemm_pack.add_const_tensor(ACL_SRC_1, weights_transformed.get());
- gemm_pack.add_const_tensor(ACL_BIAS, nullptr);
- gemm_pack.add_tensor(ACL_DST, output_transformed.get());
- _gemm_function->run(gemm_pack);
-
- // Transform output tensor to the spatial domain
- CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
- CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
- ITensorPack transform_output_pack{ { ACL_SRC_0, c }, { ACL_SRC_1, output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : d }, { ACL_INT, output_workspace.get() } };
- NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, _transform_output_kernel->window(), transform_output_pack);
-
- if(is_nchw)
- {
- // Reorder the convoluted output to ACL's ordering NCHW
- ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, d } };
- _permute_output->run(pack);
- }
-
- if(_run_activation)
- {
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
- _activation_func->run(pack);
- }
-}
-
-void CpuWinogradConv2d::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- // Permute weights
- const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
-
- CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
- ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
- _permute_weights->run(permute_tensors);
-
- // Transform weights
- ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
-
- CpuAuxTensorHandler transformed_weights(_kernel_storage, *weights_transf);
- ITensorPack transform_tensors{ { ACL_SRC, permuted_weights.get() }, { ACL_DST, transformed_weights.get() } };
- NEScheduler::get().schedule_op(_transform_weights_kernel.get(), Window::DimX, _transform_weights_kernel->window(), transform_tensors);
-
- ITensorPack gemm_pack = tensors;
- gemm_pack.add_const_tensor(ACL_SRC_1, transformed_weights.get());
- _gemm_function->prepare(gemm_pack);
-
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements CpuWinogradConv2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
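Note: the configure() logic removed above sizes a batch of n_gemms GEMMs, one per Winograd transform point: M is the number of output tiles across the batch, K the input channels and N the output channels. The standalone sketch below reproduces that arithmetic for the F(4x4, 3x3), F32 case; the input/output sizes are made-up example values and iceildiv is redefined locally.

    #include <cstdio>

    static int iceildiv(int a, int b) { return (a + b - 1) / b; }

    int main()
    {
        const int batches = 1, out_h = 56, out_w = 56;   // convolved output size (example)
        const int in_channels = 64, out_channels = 64;   // example channel counts
        const int tile_h = 4, tile_w = 4, kernel = 3;    // F(4x4, 3x3)

        const int tile_rows = iceildiv(out_h, tile_h);
        const int tile_cols = iceildiv(out_w, tile_w);
        const int m = batches * tile_rows * tile_cols;   // one GEMM row per output tile
        const int k = in_channels;
        const int n = out_channels;
        const int n_gemms = (tile_h + kernel - 1) * (tile_w + kernel - 1); // 6x6 = 36 transform points

        printf("n_gemms=%d, each GEMM: M=%d K=%d N=%d\n", n_gemms, m, k, n);
        return 0;
    }

With these example values the operator would run 36 GEMMs of size 196x64x64, matching m = in_shape.n_batches * tile_rows * tile_cols, k = in_shape.n_channels and n = out_channels in the code above.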
diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.h b/src/runtime/cpu/operators/CpuWinogradConv2d.h
deleted file mode 100644
index b5b9c3f2e3..0000000000
--- a/src/runtime/cpu/operators/CpuWinogradConv2d.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H
-#define ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/kernels/CpuWinogradConv2dKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuGemm.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuWinogradConv2d : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuWinogradConv2d();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWinogradConv2d);
- /** Destructor */
- ~CpuWinogradConv2d();
-
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:--------------|:------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- *
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
- * Currently only 3x3 and 5x5 kernels are supported.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
- * available which may introduce a drop of accuracy as well. Default is false
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d
- *
- * Similar to CpuWinogradConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum AuxTensorIdx
- {
- GemmWorkspace = 0,
- Pretranspose = 1,
- InterleavedLHS = 2,
- TransposedRHS = 3,
- TempResult = 4,
- TransformedInput = 5,
- TransformedOutput = 6,
- WorkspaceIO = 7,
- TransformedWeights = 8,
- PermutedWeights = 9,
- PermutedInput = TransformedOutput,
- PermutedOutput = TransformedInput,
- Count = 10
- };
-
- std::unique_ptr<CpuGemm> _gemm_function;
- std::unique_ptr<CpuActivation> _activation_func;
- std::unique_ptr<CpuPermute> _permute_input;
- std::unique_ptr<CpuPermute> _permute_output;
- std::unique_ptr<CpuPermute> _permute_weights;
- std::unique_ptr<ICPPKernel> _transform_input_kernel;
- std::unique_ptr<ICPPKernel> _transform_weights_kernel;
- std::unique_ptr<ICPPKernel> _transform_output_kernel;
-
- DataLayout _data_layout;
- experimental::MemoryRequirements _aux_mem{ Count };
- TensorInfo _input_nhwc;
- TensorInfo _output_nhwc;
- TensorInfo _input_workspace;
- TensorInfo _kernel_storage;
- TensorInfo _output_workspace;
- TensorInfo _input_transformed;
- TensorInfo _output_transformed;
- TensorInfo _weights_hwio;
- bool _run_activation;
- bool _is_prepared;
-};
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H */
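A minimal usage sketch of the CpuWinogradConv2d interface removed above, assuming the pre-move include path from this patch and illustrative tensor shapes; the run-time pack wiring is summarised in comments rather than spelled out:

#include "arm_compute/core/TensorInfo.h"
#include "src/runtime/cpu/operators/CpuWinogradConv2d.h" // pre-move path, as deleted above

void winograd_conv_sketch()
{
    using namespace arm_compute;
    // Illustrative shapes only: src [W, H, IFM], weights [kx, ky, IFM, OFM], dst [W, H, OFM]
    TensorInfo src(TensorShape(64U, 64U, 32U), 1, DataType::F32);
    TensorInfo weights(TensorShape(3U, 3U, 32U, 16U), 1, DataType::F32);
    TensorInfo biases(TensorShape(16U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U, 64U, 16U), 1, DataType::F32);

    const PadStrideInfo conv_info(1, 1, 1, 1); // unit strides only, as documented above

    cpu::CpuWinogradConv2d conv;
    if(bool(cpu::CpuWinogradConv2d::validate(&src, &weights, &biases, &dst, conv_info)))
    {
        conv.configure(&src, &weights, &biases, &dst, conv_info);
        // At run time: allocate the auxiliary tensors reported by conv.workspace(),
        // bind them together with ACL_SRC_0/1/2 and ACL_DST into an ITensorPack,
        // call conv.prepare(pack) once, then conv.run(pack) per inference.
    }
}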
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
deleted file mode 100644
index 9786161dee..0000000000
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ /dev/null
@@ -1,721 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
-#include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/utils/AssemblyUtils.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::experimental;
-
-namespace
-{
-struct free_delete
-{
- void operator()(void *x)
- {
- free(x);
- }
-};
-
-struct Params
-{
- unsigned int M;
- unsigned int N;
- unsigned int K;
- unsigned int batches;
- unsigned int multis;
- unsigned int sections;
- bool indirect;
-};
-
-Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
- Params p;
- p.M = d->tensor_shape().y();
- p.K = a->tensor_shape().x();
- p.N = d->tensor_shape().x();
- p.batches = 1;
- p.multis = 1;
- p.sections = 1;
- p.indirect = false;
-
- if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
- {
- p.indirect = true;
- p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
- }
- else
- {
- p.multis = b->tensor_shape().z();
- p.batches = d->tensor_shape().total_size_upper(2) / p.multis;
- }
-
- // Update M in case of GEMM3D for output
- if(info.depth_output_gemm3d != 0)
- {
- p.M = d->tensor_shape().y() * d->tensor_shape().z();
- p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
- }
-
- return p;
-}
-
-IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
-{
- // Schedule assembly kernel
- const int granule_threshold = 200;
- IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
- if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
- {
- scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
- }
- else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8))
- {
- //GEMM_INTERLEAVED_2D supports 2D parallelism; IScheduler::split_dimensions_all signals to parallelise over all window dimensions
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
- }
- else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
- {
- //Special case for QASYMM8 to support 2D parallelism; the scheduler here may be tuned differently from the FP32 case
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
- }
-
- return scheduling_hint;
-}
-
-/** Fallback in case ACL doesn't have a function */
-template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
-class Fallback : public CpuGemmAssemblyDispatch::IFallback
-{
-public:
- /** Destructor */
- ~Fallback() = default;
-
- /** Initialise the function's input and output.
- *
- * @param[in] a Input tensor containing the Matrix A.
- * @param[in] b Input tensor containing the Matrix B.
- * @param[in] c Input tensor containing the Matrix C.
- * @param[out] d Output tensor to store the result of matrix multiplication.
- * @param[in] args Matrix multiplication information.
- * @param[in] gemm_info GEMM meta-data
- * @param[in] os Output stage meta-data.
- */
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
- const OutputStage &os = {});
-
- /** Set the requantization data to be used
- *
- * @param[in] shifts      Requantization shifts
- * @param[in] multipliers Requantization multipliers
- *
- * @return A tuple with a flag indicating whether left shifts are required, followed by pointers to the left-shift, right-shift and multiplier data respectively
- */
- std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
- const std::vector<int32_t> &multipliers);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- bool is_configured() const override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum AuxTensorIdx
- {
- AsmGemmWorkspace = 0,
- Pretranspose,
- Count
- };
-
- /** Configure the indirect buffer
- *
- * @param[in] a Input tensor containing the Matrix A.
- * @param[in] b Input tensor containing the Matrix B.
- * @param[out] d Output tensor to store the result of matrix multiplication.
- * @param[in] info GEMM meta-data
- */
- void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
- /** Prepare the indirect buffer */
- void prepare_indirect_buffer(ITensorPack &tensors);
-
- /** Assembly Gemm kernel */
- std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
- /** Optimised Arm® Neon™ kernel */
- std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
- /** Assembly GEMM workspace tensor info */
- TensorInfo _workspace_info{};
- /** Pre-transpose tensor info */
- TensorInfo _pretranspose_info{};
- /** Prepared flag */
- bool _is_prepared{ false };
- /** GEMM meta-data */
- AsmGemmInfo _gemm_info{};
- /** GEMM kernel description */
- arm_gemm::KernelDescription _kernel_info{};
- /** Per channel quantization shifts */
- std::vector<int32_t> _shifts{};
- std::vector<int32_t> right_shifts{};
- std::vector<int32_t> left_shifts{};
- /** Per channel quantization multipliers */
- std::vector<int32_t> _multipliers{};
- /** Indirect buffer */
- std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
- std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
- std::vector<TypeInput> _indirect_pad{};
- arm_gemm::ConvolutionParameters _cp{};
- experimental::MemoryRequirements _aux_mem{ Count };
-};
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
-Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
-{
- _multipliers = multipliers;
- _shifts = shifts;
- bool need_left = false;
- for(const auto s : _shifts)
- {
- left_shifts.push_back(std::max(-s, int32_t(0)));
- right_shifts.push_back(std::min(-s, int32_t(0)));
- if(s < 0 && !need_left)
- {
- need_left = true;
- }
- }
- return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
-}
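// Worked example (illustrative, not part of this patch) of the shift splitting performed
// by set_requantize_data() above, where each stored per-channel shift s is decomposed
// using left = max(-s, 0) and right = min(-s, 0):
//   shifts = { -2, 0, 3 }
//     s = -2 -> left = 2, right =  0   (s < 0, so need_left becomes true)
//     s =  0 -> left = 0, right =  0
//     s =  3 -> left = 0, right = -3
// The returned tuple is { need_left, left_shifts.data(), right_shifts.data(), _multipliers.data() }.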
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
-{
- auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());
- const int multis = 1;
- const int batches = a->info()->tensor_shape().total_size_upper(3);
- const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
- const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
-
- const size_t output_hw = _cp.output_height * _cp.output_width;
- const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
- const size_t batch_stride = batch_size / sizeof(TypeInput);
- const int multi_size = batch_size * batches;
- const size_t multi_stride = multi_size / sizeof(TypeInput);
-
- for(int64_t m = 0; m < multis; m++)
- {
- for(int64_t b = 0; b < batches; b++)
- {
- for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
- {
- for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
- {
- int64_t output_xy = (output_y * _cp.output_width) + output_x;
-
- for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
- {
- for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
- {
- int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
- int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
- int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
- int64_t input_xy = (input_y * _cp.input_width) + input_x;
-
- if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
- {
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
- }
- else
- {
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
- A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
- }
- }
- }
- }
- }
- }
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
-
- float zeropad = 0.f;
- if(is_data_type_quantized(a->data_type()))
- {
- zeropad = a->quantization_info().uniform().offset;
- }
-
- const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]);
- const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]);
- const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
- const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]);
- const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]);
- const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
- const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
-
- _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
- info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
- };
-
- if(info.method == AsmConvMethod::Conv)
- {
- _gemm_kernel_asm->set_convolution_parameters(_cp);
- }
-
- if(info.method == AsmConvMethod::Indirect)
- {
- const unsigned int multis = 1;
- const unsigned int batches = a->tensor_shape().total_size_upper(3);
- const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
- const unsigned int output_hw = _cp.output_width * _cp.output_height;
-
- using TypeInputPtr = TypeInput *;
- const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr);
- const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
- const int multi_size = batch_size * batches;
- const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
-
- _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
- _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
- _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
-
- // Set indirect argument
- int64_t pos = 0;
- for(int64_t m = 0; m < multis; m++)
- {
- for(int64_t b = 0; b < batches; b++)
- {
- for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
- {
- (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
- }
- }
- }
-
- _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
- const OutputStage &os)
-{
- ARM_COMPUTE_UNUSED(c);
- arm_gemm::GemmConfig gemm_cfg;
- _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
- if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
- {
- gemm_cfg.filter = _kernel_info.name;
- args._cfg = &gemm_cfg;
- }
- _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
- if(_gemm_kernel_asm == nullptr)
- {
- //Configuration not supported: leave the function unconfigured
- return;
- }
-
- // arm_compute wrapper for the Gemm object (see above)
- auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
- ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
- acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
- const size_t workspace_size = _gemm_kernel_asm->get_working_size();
- const unsigned int alignment = 4096;
- _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
- _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
-
- //If the block below (in braces) is disabled, ConvLayer deadlocks when threads > 1 and
- //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
- {
- const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
- if(window_size < static_cast<unsigned int>(args._maxthreads))
- {
- _gemm_kernel_asm->set_nthreads(window_size);
- }
- }
-
- _optimised_kernel = std::move(acl_gemm_wrapper);
- _gemm_info = gemm_info;
- // Check for pre-transposed support
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- // Forcing 128-byte alignment (required by 32-bit kernels)
- const unsigned int alignment = 128;
- const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
- _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
- _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
- }
-
- // Handle indirect GEMM convolution
- if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
- {
- configure_indirect(a, b, d, gemm_info);
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
-
- // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
- if(c && c->info()->data_type() == DataType::S32)
- {
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
- }
-
- // Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- const int ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
-
- CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
- ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
- _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
-
- b->mark_as_unused();
- }
-
- if(_gemm_info.method == AsmConvMethod::Indirect)
- {
- prepare_indirect_buffer(tensors);
- }
-
- _is_prepared = true;
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
-{
- return _optimised_kernel != nullptr;
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const
-{
- return _aux_mem;
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
-{
- auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto d = tensors.get_tensor(TensorType::ACL_DST);
-
- int lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
- int ldb = 0;
- const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
-
- const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
- const size_t a_multi_idx = a_batch_idx + 1;
- const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
- const size_t d_multi_idx = d_batch_idx + 1;
-
- int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
- const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
-
- int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
- int multi_stride_b = 0;
- const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
-
- auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
- const TypeInput *in1_ptr = nullptr;
- auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
-
- // Check if B is pre-transposed and de-reference it if not
- if(!_gemm_kernel_asm->B_is_pretransposed())
- {
- ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
- in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- }
-
- const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
-
- // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
- CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
- if(workspace.get()->buffer() != nullptr)
- {
- _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
- const unsigned int split_dim = scheduling_hint.split_dimension();
- const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
- unsigned int num_threads = NEScheduler::get().num_threads();
- if(window_size < num_threads)
- {
- num_threads = window_size;
- }
- if(split_dim != IScheduler::split_dimensions_all)
- {
- // Make sure the kernel does not expect more threads than we can actually spawn
- const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
- num_threads = std::min(num_iterations, num_threads);
- }
- _gemm_kernel_asm->set_nthreads(num_threads);
- }
-
- // Prepare assembly kernel
- prepare(tensors);
-
- // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
- TypeOutput *bias = nullptr;
- if(c && c->info()->data_type() != DataType::S32)
- {
- bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
- }
-
- if(_gemm_info.method == AsmConvMethod::Indirect)
- {
- in0_ptr = nullptr;
- lda = 0;
- batch_stride_a = 0;
- multi_stride_a = 0;
- }
-
- // Set gemm parameters
- _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
- in1_ptr, ldb, multi_stride_b,
- out_ptr, ldd, batch_stride_d, multi_stride_d,
- bias, 0);
- // Schedule
- NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::Activation activation, const AsmGemmInfo &info)
-{
- Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
-
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
-
- // Create arm_gemm fallback
- auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
- fallback->configure(a, b, c, d, args, info);
- arm_gemm = std::move(fallback);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::Activation activation, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_UNUSED(activation);
- Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- const unsigned int num_threads = NEScheduler::get().num_threads();
-
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
-
- // Create arm_gemm fallback
- auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
-
- // Configure requantization info
- const int32_t negation = info.negated_offsets ? 1 : -1;
- const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
- const int32_t b_offset = -b->quantization_info().uniform().offset * negation;
- const GEMMLowpOutputStageInfo os_info = info.output_stage;
-
- arm_gemm::Requantize32 gemm_requant_info{};
- if(os_info.gemmlowp_shifts.size() > 1)
- {
- const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
- std::get<2>(requantize_data),
- std::get<3>(requantize_data),
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
- }
- else
- {
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
- }
-
- // Configure fallback
- fallback->configure(a, b, c, d, args, info, gemm_requant_info);
- arm_gemm = std::move(fallback);
-}
-} //namespace
-
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
- : _arm_gemm(nullptr)
-{
-}
-
-Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_UNUSED(c, info);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
-
-#ifndef __aarch64__
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
-#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(is_data_type_quantized_per_channel(b->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
- return Status{};
-}
-
-bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
-{
- arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
- return act.type != arm_gemm::Activation::Type::None;
-}
-
-void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
- arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
-
- //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
- {
- return;
- }
-
- switch(a->data_type())
- {
- case DataType::F32:
- create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
- break;
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(d->data_type() == DataType::S32)
- {
- create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
- }
- else
- {
- create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
- }
- break;
- case DataType::S8:
- case DataType::QASYMM8_SIGNED:
- if(d->data_type() == DataType::S32)
- {
- create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
- }
- else
- {
- create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
- }
- break;
-#endif /* __aarch64__ */
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
- case DataType::BFLOAT16:
- create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
- break;
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- break;
- }
-}
-
-void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- _arm_gemm->prepare(tensors);
-}
-
-bool CpuGemmAssemblyDispatch::is_configured() const
-{
- return _arm_gemm != nullptr && _arm_gemm->is_configured();
-}
-
-void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- _arm_gemm->run(tensors);
-}
-
-experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const
-{
- ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- return _arm_gemm->workspace();
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
deleted file mode 100644
index 88cfed002a..0000000000
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
-#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
-
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/* Convolution method supported by the assembly gemm interface */
-enum class AsmConvMethod
-{
- Im2Col,
- Indirect,
- Conv
-};
-
-struct AsmGemmInfo
-{
- AsmConvMethod method{ AsmConvMethod::Im2Col };
- PadStrideInfo ps_info{};
- ActivationLayerInfo activation_info{};
- GEMMLowpOutputStageInfo output_stage{};
- bool negated_offsets{ true };
- bool reinterpret_input_as_3d{ false };
- bool depth_output_gemm3d{ false };
- int64_t padding_top{ 0 };
- int64_t padding_left{ 0 };
- float padding_value{ 0.f };
- bool fast_mode{ false };
-};
-
-/** Assembly kernel glue */
-class CpuGemmAssemblyDispatch : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuGemmAssemblyDispatch();
- /** Default destructor */
- ~CpuGemmAssemblyDispatch() = default;
-
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch);
-
- class IFallback
- {
- public:
- virtual void run(ITensorPack &tensors) = 0;
- virtual void prepare(ITensorPack &tensors) = 0;
- virtual experimental::MemoryRequirements workspace() const = 0;
- virtual bool is_configured() const = 0;
- virtual ~IFallback() = default;
- };
-
-public:
- /** If supported, create a Compute Library function, otherwise fall back to the arm_gemm function.
- *
- * @param[in] a Input tensor (Matrix A)
- * @param[in] b Input tensor (Matrix B)
- * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations
- * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] info GEMM meta-data
- */
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
-
- /** Indicates whether or not this function can be used to process the given parameters.
- *
- * @param[in] a Input tensor info (Matrix A)
- * @param[in] b Input tensor info (Matrix B)
- * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations
- * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] info GEMM meta-data
- *
- * @return a status.
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
- /** Checks if activation is supported by the gemm assembly dispatcher
- *
- * @param[in] activation Activation to check
- *
- * @return True if activation is supported else false
- */
- static bool is_activation_supported(const ActivationLayerInfo &activation);
- /** Was the function successfully configured?
- *
- * @return True if the function is configured and ready to run
- */
- bool is_configured() const;
-
- // Inherited methods overridden:
- void prepare(ITensorPack &tensors) override;
- void run(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */
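A caller-side sketch of the dispatcher interface removed above, assuming the pre-move include path and illustrative GEMM shapes (M=32, N=64, K=128); the workspace/pack handling is summarised in comments:

#include "arm_compute/core/TensorInfo.h"
#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" // pre-move path, as deleted above

void asm_gemm_dispatch_sketch()
{
    using namespace arm_compute;
    cpu::AsmGemmInfo info{};
    info.activation_info = ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU);

    // Illustrative shapes (TensorShape lists x first): a{K=128, M=32}, b{N=64, K=128}, d{N=64, M=32}
    TensorInfo a(TensorShape(128U, 32U), 1, DataType::F32);
    TensorInfo b(TensorShape(64U, 128U), 1, DataType::F32);
    TensorInfo d(TensorShape(64U, 32U), 1, DataType::F32);

    cpu::CpuGemmAssemblyDispatch asm_gemm;
    if(bool(cpu::CpuGemmAssemblyDispatch::validate(&a, &b, nullptr, &d, info)))
    {
        asm_gemm.configure(&a, &b, nullptr, &d, info);
    }
    // configure() returns silently when no assembly kernel matches, so callers must
    // still check is_configured() before packing tensors and calling prepare()/run().
    if(asm_gemm.is_configured())
    {
        // Bind ACL_SRC_0/ACL_SRC_1/ACL_DST plus the buffers described by asm_gemm.workspace()
        // into an ITensorPack, then call asm_gemm.prepare(pack) followed by asm_gemm.run(pack).
    }
}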
diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
deleted file mode 100644
index ae1cffb659..0000000000
--- a/src/runtime/cpu/utils/CpuAuxTensorHandler.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
-#define ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
-
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include "src/common/utils/Log.h"
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/* Tensor handler to wrap and handle tensor allocations on workspace buffers */
-class CpuAuxTensorHandler
-{
-public:
- CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
- : _tensor()
- {
- if(info.total_size() == 0)
- {
- return;
- }
- _tensor.allocator()->soft_init(info);
-
- ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
- if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
- {
- if(!bypass_alloc)
- {
- _tensor.allocator()->allocate();
- ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
- }
-
- if(pack_inject)
- {
- pack.add_tensor(slot_id, &_tensor);
- _injected_tensor_pack = &pack;
- _injected_slot_id = slot_id;
- }
- }
- else
- {
- _tensor.allocator()->import_memory(packed_tensor->buffer());
- }
- }
-
- CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor)
- : _tensor()
- {
- _tensor.allocator()->soft_init(info);
- if(info.total_size() <= tensor.info()->total_size())
- {
- _tensor.allocator()->import_memory(tensor.buffer());
- }
- }
-
- CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
- CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler &) = delete;
-
- ~CpuAuxTensorHandler()
- {
- if(_injected_tensor_pack)
- {
- _injected_tensor_pack->remove_tensor(_injected_slot_id);
- }
- }
-
- ITensor *get()
- {
- return &_tensor;
- }
-
- ITensor *operator()()
- {
- return &_tensor;
- }
-
-private:
- Tensor _tensor{};
- ITensorPack *_injected_tensor_pack{ nullptr };
- int _injected_slot_id{ TensorType::ACL_UNKNOWN };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */ \ No newline at end of file
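An illustrative sketch (not taken from this patch) of how an operator's run() typically consumes the handler above to service a workspace slot; the function name, slot argument and pre-move include path are assumptions:

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" // pre-move path, as deleted above

void run_with_workspace(arm_compute::ITensorPack &tensors, arm_compute::TensorInfo &ws_info, int ws_slot)
{
    // Imports the packed buffer when 'tensors' carries one of sufficient size for 'ws_slot';
    // otherwise allocates a temporary tensor that lives only as long as the handler.
    arm_compute::cpu::CpuAuxTensorHandler workspace(ws_slot, ws_info, tensors, false);
    arm_compute::ITensor *buf = workspace.get();
    // ... add 'buf' to the kernel packs scheduled by the operator ...
    (void)buf;
}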
diff --git a/src/runtime/gpu/cl/IClOperator.h b/src/runtime/gpu/cl/IClOperator.h
deleted file mode 100644
index 049bf05dc1..0000000000
--- a/src/runtime/gpu/cl/IClOperator.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICL_OPERATOR_H
-#define ARM_COMPUTE_ICL_OPERATOR_H
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/runtime/CL/ICLOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using IClOperator = experimental::ICLOperator;
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ICL_OPERATOR_H */
diff --git a/src/runtime/gpu/cl/operators/ClActivation.cpp b/src/runtime/gpu/cl/operators/ClActivation.cpp
deleted file mode 100644
index 34a2f94fdc..0000000000
--- a/src/runtime/gpu/cl/operators/ClActivation.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClActivation.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-
-#include "src/common/IOperator.h"
-#include "src/common/utils/LegacySupport.h"
-#include "src/gpu/cl/ClContext.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClActivationKernel>();
- k->configure(compile_context, src, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClActivation::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClActivationKernel::validate(src, dst, act_info);
-}
-} // namespace opencl
-
-namespace gpu
-{
-namespace opencl
-{
-std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
-{
- TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
- TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
- auto info = detail::convert_to_activation_info(act);
-
- if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
- {
- return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
- }
-
- auto act_op = std::make_unique<arm_compute::opencl::ClActivation>();
- act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info);
-
- auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
- if(op == nullptr)
- {
- ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
- return std::make_tuple(nullptr, StatusCode::OutOfMemory);
- }
- op->set_internal_operator(std::move(act_op));
-
- return std::make_tuple(op, StatusCode::Success);
-}
-} // namespace opencl
-} // namespace gpu
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClActivation.h b/src/runtime/gpu/cl/operators/ClActivation.h
deleted file mode 100644
index 82ef8ac63a..0000000000
--- a/src/runtime/gpu/cl/operators/ClActivation.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ACTIVATION_H
-#define ARM_COMPUTE_CL_ACTIVATION_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClActivationKernel */
-class ClActivation : public IClOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] activation_info Activation layer parameters.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClActivation::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ACTIVATION_H */
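A minimal configuration sketch for the ClActivation operator removed above; the shape, activation parameters and pre-move include paths are illustrative assumptions:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/gpu/cl/operators/ClActivation.h" // pre-move path, as deleted above

void cl_activation_sketch()
{
    using namespace arm_compute;
    TensorInfo src(TensorShape(256U), 1, DataType::F32);
    TensorInfo dst(TensorShape(256U), 1, DataType::F32);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);

    opencl::ClActivation act_op;
    if(bool(opencl::ClActivation::validate(&src, &dst, act)))
    {
        act_op.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, act);
        // run() is inherited from IClOperator: pack the CL source/destination tensors
        // as ACL_SRC/ACL_DST in an ITensorPack and call act_op.run(pack).
    }
}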
diff --git a/src/runtime/gpu/cl/operators/ClAdd.cpp b/src/runtime/gpu/cl/operators/ClAdd.cpp
deleted file mode 100644
index 01f550f819..0000000000
--- a/src/runtime/gpu/cl/operators/ClAdd.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClAdd.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
- _kernel = std::move(k);
-}
-
-Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClAdd.h b/src/runtime/gpu/cl/operators/ClAdd.h
deleted file mode 100644
index 7b84a767d6..0000000000
--- a/src/runtime/gpu/cl/operators/ClAdd.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ADD_H
-#define ARM_COMPUTE_CL_ADD_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run arithmetic addition
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @note The function performs an arithmetic addition between two tensors.
- */
-class ClAdd : public IClOperator
-{
-public:
- /** Configure function for a given list of arguments.
- *
- * Valid configurations (src1,src2) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClAdd::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ADD_H */
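A minimal configuration sketch for ClAdd using the (F32,F32) -> F32 configuration from the list above; shapes, the saturation policy and the pre-move include paths are illustrative assumptions:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/gpu/cl/operators/ClAdd.h" // pre-move path, as deleted above

void cl_add_sketch()
{
    using namespace arm_compute;
    TensorInfo src1(TensorShape(32U, 32U), 1, DataType::F32);
    TensorInfo src2(TensorShape(32U, 32U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 32U), 1, DataType::F32);

    opencl::ClAdd add;
    if(bool(opencl::ClAdd::validate(&src1, &src2, &dst, ConvertPolicy::SATURATE)))
    {
        add.configure(CLKernelLibrary::get().get_compile_context(), &src1, &src2, &dst, ConvertPolicy::SATURATE);
    }
}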
diff --git a/src/runtime/gpu/cl/operators/ClCast.cpp b/src/runtime/gpu/cl/operators/ClCast.cpp
deleted file mode 100644
index 3f54004aa7..0000000000
--- a/src/runtime/gpu/cl/operators/ClCast.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClCast.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClCastKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
- auto k = std::make_unique<kernels::ClCastKernel>();
- k->configure(compile_context, src, dst, policy);
- _kernel = std::move(k);
-}
-
-Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- return kernels::ClCastKernel::validate(src, dst, policy);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClCast.h b/src/runtime/gpu/cl/operators/ClCast.h
deleted file mode 100644
index 107eb2bfe9..0000000000
--- a/src/runtime/gpu/cl/operators/ClCast.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CAST_H
-#define ARM_COMPUTE_CL_CAST_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClCastKernel */
-class ClCast : public IClOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @note Input data type must be different than output data type.
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:--------------------------------------|
- * |U8 | S8, U16, S16, U32, S32, F16, F32 |
- * |U16 | U8, S8, S16, U32, S32, F16, F32 |
- * |S16 | U8, S8, U16, U32, S32, F16, F32 |
- * |U32 | U8, S8, U16, S16, S32, F16, F32 |
- * |S32 | U8, S8, U16, S16, U32, F16, F32 |
- * |F16 | U8, S8, U16, S16, U32, F32 |
- * |F32 | U8, S8, U16, S16, U32, F16 |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[out] dst             The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClCast::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CAST_H */
diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.cpp b/src/runtime/gpu/cl/operators/ClConcatenate.cpp
deleted file mode 100644
index d3c05eae78..0000000000
--- a/src/runtime/gpu/cl/operators/ClConcatenate.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClConcatenate.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
-#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis)
-{
- ARM_COMPUTE_ERROR_ON(dst == nullptr);
- _axis = axis;
- _num_inputs = src_vector.size();
-
- TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
- std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
- std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t;
- });
-
- // dst auto initialization if not yet initialized
- auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
- ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
-
- unsigned int offset = 0;
- switch(_axis)
- {
- case Window::DimX:
- {
- switch(_num_inputs)
- {
- case 2:
- {
- // Configure WidthConcatenate2Tensors kernel
- auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>();
- kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- case 4:
- {
- // Configure WidthConcatenate4Tensors kernel
- auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
- kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst);
- _concat_kernels.emplace_back(std::move(kernel));
- break;
- }
- default:
- {
- // Configure generic case WidthConcatenate kernels
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
- kernel->configure(compile_context, src_vector.at(i), offset, dst);
- offset += src_vector.at(i)->dimension(_axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- }
- break;
- }
- case Window::DimY:
- {
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
- kernel->configure(compile_context, src_vector.at(i), offset, dst);
- offset += src_vector.at(i)->dimension(_axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- case Window::DimZ:
- {
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
- kernel->configure(compile_context, src_vector.at(i), offset, dst);
- offset += src_vector.at(i)->dimension(_axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- case 3:
- {
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>();
- kernel->configure(compile_context, src_vector.at(i), offset, dst);
- offset += src_vector.at(i)->dimension(_axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
-}
-
-Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr);
- const unsigned int num_inputs = src_vector.size();
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
- ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
-
- unsigned int offset = 0;
- switch(axis)
- {
- case Window::DimX:
- {
- switch(num_inputs)
- {
- case 2:
- // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
- break;
- case 4:
- // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
- break;
- default:
- // Validate generic case of WidthConcatenate kernel
- for(const auto &src : src_vector)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst));
- offset += src->dimension(axis);
- }
- break;
- }
- break;
- }
- case Window::DimY:
- {
- for(const auto &src : src_vector)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst));
- offset += src->dimension(axis);
- }
- break;
- }
- case Window::DimZ:
- {
- for(const auto &src : src_vector)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst));
- offset += src->dimension(axis);
- }
- break;
- }
- case 3:
- {
- for(const auto &src : src_vector)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst));
- offset += src->dimension(axis);
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
-
- if(dst->total_size() != 0)
- {
- TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis);
- ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
- }
-
- return Status{};
-}
-
-void ClConcatenate::run(ITensorPack &tensors)
-{
- if(tensors.empty())
- {
- ARM_COMPUTE_ERROR("No inputs provided");
- }
-
- if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
- {
- ARM_COMPUTE_ERROR("Configured with different number of inputs");
- }
-
- if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
- {
- ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
- CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
- }
- else
- {
- int i = 0;
- for(auto &k : _concat_kernels)
- {
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
- pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
- CLScheduler::get().enqueue_op(*k, pack, true);
- ++i;
- }
- }
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClConcatenate.h b/src/runtime/gpu/cl/operators/ClConcatenate.h
deleted file mode 100644
index 153400bd73..0000000000
--- a/src/runtime/gpu/cl/operators/ClConcatenate.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONCATENATE_H
-#define ARM_COMPUTE_CLCONCATENATE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
- *
- * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
- * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
- * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
- * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
- */
-class ClConcatenate : public IClOperator
-{
-public:
- ClConcatenate() = default;
- /** Initialise the kernel's inputs vector and dst.
- *
- * @note Input and dst tensor dimension preconditions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
- * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
- *
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] src_vector      The vector containing all the tensor infos to concatenate. Data types supported: All
- * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
- */
- void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClConcatenate::configure()
- *
- * @return a status
- */
- static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-
-private:
- std::vector<std::unique_ptr<IClKernel>> _concat_kernels{};
- unsigned int _num_inputs{ 0 };
- unsigned int _axis{ 0 };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCONCATENATE_H */
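A hedged sketch of how the width-axis path above could be exercised (the helper function and tensor setup are illustrative; it assumes the global CLKernelLibrary compile context and passes the sources as ACL_SRC_VEC + index, mirroring ClConcatenate::run()):

    #include <vector>

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/runtime/gpu/cl/operators/ClConcatenate.h"

    using namespace arm_compute;

    void concat_example(CLTensor &in0, CLTensor &in1, CLTensor &out)
    {
        opencl::ClConcatenate concat;
        const std::vector<ITensorInfo *> srcs{ in0.info(), in1.info() };
        concat.configure(CLKernelLibrary::get().get_compile_context(), srcs, out.info(), 0 /* Window::DimX */);

        // At run time the sources are indexed from ACL_SRC_VEC, as ClConcatenate::run() expects.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_VEC, &in0);
        pack.add_tensor(TensorType::ACL_SRC_VEC + 1, &in1);
        pack.add_tensor(TensorType::ACL_DST, &out);
        concat.run(pack);
    }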
diff --git a/src/runtime/gpu/cl/operators/ClConv2d.cpp b/src/runtime/gpu/cl/operators/ClConv2d.cpp
deleted file mode 100644
index 0cb3a968e6..0000000000
--- a/src/runtime/gpu/cl/operators/ClConv2d.cpp
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClConv2d.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
-#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
-#include "src/runtime/gpu/cl/operators/ClGemmConv2d.h"
-#include "src/runtime/gpu/cl/operators/ClWinogradConv2d.h"
-
-#include <memory>
-
-namespace
-{
-/** Get the suitable kernel size for using direct convolution method with NHWC data layout.
- *
- * @note Direct convolution should be executed when the kernel's spatial dimensions are greater than or equal to the value returned by this function
- *
- * @param[in] gpu_target GPU target
- *
- * @return the suitable kernel size for using direct convolution method with NHWC data layout
- */
-size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
-{
- switch(gpu_target)
- {
- case arm_compute::GPUTarget::G76:
- case arm_compute::GPUTarget::G77:
- case arm_compute::GPUTarget::G78:
- return 5;
- case arm_compute::GPUTarget::G71:
- case arm_compute::GPUTarget::G72:
- case arm_compute::GPUTarget::MIDGARD:
- case arm_compute::GPUTarget::BIFROST:
- return 7;
- default:
- return 5;
- }
-}
-} // namespace
-
-namespace arm_compute
-{
-namespace opencl
-{
-using namespace arm_compute::misc::shape_calculator;
-
-ClConv2d::ClConv2d()
- : _operator()
-{
-}
-
-ClConv2d::~ClConv2d() = default;
-
-void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
-
- switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
- {
- case ConvolutionMethod::WINOGRAD:
- {
- ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
- auto f = std::make_unique<ClWinogradConv2d>();
- f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math);
- _operator = std::move(f);
- break;
- }
- case ConvolutionMethod::DIRECT:
- {
- ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
- auto f = std::make_unique<ClDirectConv2d>();
- f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
- _operator = std::move(f);
- break;
- }
- case ConvolutionMethod::GEMM:
- {
- auto f = std::make_unique<ClGemmConv2d>();
- f->configure(compile_context, src, weights, biases, dst, conv2d_info, weights_info);
- _operator = std::move(f);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
- _aux_mem = _operator->workspace();
-}
-
-Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
-
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target))
- {
- case ConvolutionMethod::WINOGRAD:
- {
- //Validate Winograd
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math));
- break;
- }
- case ConvolutionMethod::DIRECT:
- {
- // Validate direct convolution layer
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
- break;
- }
- case ConvolutionMethod::GEMM:
- {
- // Validate gemm-based convolution layer
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
-
- return Status{};
-}
-
-ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info, const GPUTarget gpu_target)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_UNUSED(weights_info);
-
- const PadStrideInfo conv_info = conv2d_info.conv_info;
- const ActivationLayerInfo act_info = conv2d_info.act_info;
- const Size2D dilation = conv2d_info.dilation;
- bool enable_fast_math = conv2d_info.enable_fast_math;
-
- const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
-
- /* Input spatial dims, kernel size, IFM/OFM, conv info*/
- using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
- using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
-
- const std::vector<ConfigurationMethod> known_configs =
- {
- // Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
- // VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
- };
-
- const auto find_config = [&](ConfigurationMethod c)
- {
- const ConvolutionConfiguration config = c.first;
- const PadStrideInfo info = std::get<3>(config);
- const DataLayout data_layout = std::get<4>(config);
-
- return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout());
- };
-
- std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
- {
- return (*found).second;
- }
-
- if(dilation != Size2D(1U, 1U))
- {
- return ConvolutionMethod::GEMM;
- }
- else
- {
- if(src->data_layout() == DataLayout::NCHW)
- {
- // SRGAN
- if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
- && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)))
- {
- return ConvolutionMethod::DIRECT;
- }
- if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)))
- {
- return ConvolutionMethod::FFT;
- }
- if(src->dimension(idx_c) < 16)
- {
- return ConvolutionMethod::GEMM;
- }
- return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
- }
- else
- {
- const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
- const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math));
- const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
-
- // SRGAN case
- if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
- && is_direct_valid)
- {
- return ConvolutionMethod::DIRECT;
- }
-
- // Floating-point case: GeMM/Direct/Winograd
- if(is_data_type_float(src->data_type()))
- {
- const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
- const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
- const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
-
- // Run Winograd if valid and IFM >= 16
- if(is_wino_valid && is_ifm_ge_16)
- {
- return ConvolutionMethod::WINOGRAD;
- }
- // Run Direct for Large kernel size
- if(is_large_kernel_sz && is_ifm_ge_16 && is_direct_valid && is_ifm_gt_ofm)
- {
- return ConvolutionMethod::DIRECT;
- }
-
- // Default case
- return ConvolutionMethod::GEMM;
- }
-
- // Generic case for quantized. Only GeMM
- return ConvolutionMethod::GEMM;
- }
- }
-}
-
-void ClConv2d::run(ITensorPack &tensors)
-{
- prepare(tensors);
- _operator->run(tensors);
-}
-
-void ClConv2d::prepare(ITensorPack &tensors)
-{
- _operator->prepare(tensors);
-}
-
-experimental::MemoryRequirements ClConv2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClConv2d.h b/src/runtime/gpu/cl/operators/ClConv2d.h
deleted file mode 100644
index cdf3b7df32..0000000000
--- a/src/runtime/gpu/cl/operators/ClConv2d.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONV2D_H
-#define ARM_COMPUTE_CLCONV2D_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
- *
- * -# @ref opencl::ClGemmConv2d
- * -# @ref opencl::ClWinogradConv2d
- * -# @ref opencl::ClDirectConv2d
- * -# @ref CLFFTConvolutionLayer
- *
- * The function selects one of the algorithms mentioned above based on:
- * - The size of the kernel
- * - Number of src/dst feature maps
- * - Amount of memory needed
- *
- * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed.
- *
- * FP32 Algorithm| Filter Size | Input/Output feature maps |
- * --------------|-------------------------------------------------------------|-------------------------------------------|
- * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 |
- * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps |
- * DirectConv | 9x9 | |
- * GEMM | Any size | |
- *
- * Winograd 5x5 requires fast maths enabled.
- *
- * FP16 Algorithm| Filter Size | Input/Output feature maps |
- * --------------|----------------------------|-------------------------------------------|
- * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5 | Input channels is greater than 3 |
- * FFT | Not supported | |
- * DirectConv | 9x9 | |
- * GEMM | Any size | |
- *
- * Winograd FP16 requires fast maths enabled.
- *
- */
-class ClConv2d : public IClOperator
-{
-public:
- /** Default constructor */
- ClConv2d();
- /** Default Destructor */
- ~ClConv2d();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ClConv2d(const ClConv2d &) = delete;
- /** Default move constructor */
- ClConv2d(ClConv2d &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ClConv2d &operator=(const ClConv2d &) = delete;
- /** Default move assignment operator */
- ClConv2d &operator=(ClConv2d &&) = default;
- /** Set the src and dst tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:------------------|:------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. 3 lower dimensions represent a single src [width, height, IFM],
- *                             while every optional dimension from 4 and above represents a batch of srcs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Same as @p src, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] dst             Destination tensor info. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent a batch of dsts.
- * Data types supported: Same as @p src.
- * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info = WeightsInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d
- *
- * Similar to ClConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info = WeightsInfo());
- /** Static function to check if given info will return the convolution called by @ref ClConv2d
- *
- * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
- *                        while every optional dimension from 4 and above represents a batch of srcs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] dst          Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent a batch of dsts.
- * Data types supported: Same as @p src.
- * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel.
- * @param[in] gpu_target Specifies the @p GPUTarget.
- *
- * @return the Convolution Method Hint
- */
- static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info, const GPUTarget gpu_target);
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- std::unique_ptr<IClOperator> _operator;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCONV2D_H */
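A sketch of querying the selection heuristic above alongside configuration (the wrapper function is hypothetical, the Conv2dInfo descriptor is assumed to be built by the caller, and the global CLKernelLibrary compile context is used):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/runtime/gpu/cl/operators/ClConv2d.h"

    using namespace arm_compute;

    ConvolutionMethod conv2d_example(opencl::ClConv2d &conv, ITensorInfo *src, ITensorInfo *weights,
                                     ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info)
    {
        // configure() chooses between Winograd/Direct/GEMM internally through the same heuristic.
        conv.configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv2d_info);

        // Report which path would be taken for these shapes on the current GPU target.
        return opencl::ClConv2d::get_convolution_method(src, weights, dst, conv2d_info,
                                                        WeightsInfo(), CLScheduler::get().target());
    }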
diff --git a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
deleted file mode 100644
index 0d2f2925d3..0000000000
--- a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
- auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
- k->configure(compile_context, src, dst, original_src_shape, data_layout);
- _kernel = std::move(k);
-}
-
-Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
- return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h
deleted file mode 100644
index 7ea35c5a8a..0000000000
--- a/src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H
-#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClConvertFullyConnectedWeightsKernel */
-class ClConvertFullyConnectedWeights : public IClOperator
-{
-public:
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[out] dst                 The dst tensor info. Data types supported: Same as @p src
- * @param[in]  original_src_shape Shape of the original src tensor (the one entering the fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClConvertFullyConnectedWeights::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H */
diff --git a/src/runtime/gpu/cl/operators/ClCopy.cpp b/src/runtime/gpu/cl/operators/ClCopy.cpp
deleted file mode 100644
index 2bdb1f5ba1..0000000000
--- a/src/runtime/gpu/cl/operators/ClCopy.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClCopy.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClCopyKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
-{
- auto k = std::make_unique<kernels::ClCopyKernel>();
- k->configure(compile_context, src, dst, dst_window);
- _kernel = std::move(k);
-}
-
-Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window)
-{
- return kernels::ClCopyKernel::validate(src, dst, dst_window);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClCopy.h b/src/runtime/gpu/cl/operators/ClCopy.h
deleted file mode 100644
index e8ea8125eb..0000000000
--- a/src/runtime/gpu/cl/operators/ClCopy.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_COPY_H
-#define ARM_COMPUTE_CL_COPY_H
-
-#include "arm_compute/core/Window.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClCopyKernel */
-class ClCopy : public IClOperator
-{
-public:
- /** Initialise the function's source and destination.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src.
- * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClCopy::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_COPY_H */
diff --git a/src/runtime/gpu/cl/operators/ClCrop.cpp b/src/runtime/gpu/cl/operators/ClCrop.cpp
deleted file mode 100644
index 17bb11912f..0000000000
--- a/src/runtime/gpu/cl/operators/ClCrop.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClCrop.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClCropKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
-{
- auto k = std::make_unique<kernels::ClCropKernel>();
- k->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window);
- _kernel = std::move(k);
-}
-
-Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
-{
- return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClCrop.h b/src/runtime/gpu/cl/operators/ClCrop.h
deleted file mode 100644
index cca69d6d77..0000000000
--- a/src/runtime/gpu/cl/operators/ClCrop.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_CROP_H
-#define ARM_COMPUTE_CL_CROP_H
-
-#include "arm_compute/core/Window.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClCropKernel */
-class ClCrop : public IClOperator
-{
-public:
- /** Initialise the function's source and destination.
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
- * @param[out] dst Destination tensor info. Data type supported: F32
- * @param[in] start Coordinates of where to start cropping the image.
- * @param[in] end Coordinates of where to end cropping the image.
- * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
- * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
- * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClCrop::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CROP_H */
diff --git a/src/runtime/gpu/cl/operators/ClDequantize.cpp b/src/runtime/gpu/cl/operators/ClDequantize.cpp
deleted file mode 100644
index 0c1391bb45..0000000000
--- a/src/runtime/gpu/cl/operators/ClDequantize.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClDequantize.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClDequantizeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClDequantizeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClDequantizeKernel::validate(src, dst);
-}
-
-void ClDequantize::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDequantize.h b/src/runtime/gpu/cl/operators/ClDequantize.h
deleted file mode 100644
index 5bcdcb2113..0000000000
--- a/src/runtime/gpu/cl/operators/ClDequantize.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H
-#define ARM_COMPUTE_CL_DEQUANTIZE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */
-class ClDequantize : public IClOperator
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] dst Destination tensor info with the same dimensions of @p src. Data type supported: F16/F32.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClDequantize::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */
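A minimal sketch of the stateless run pattern these operators share, using ClDequantize (the helper function is illustrative, the tensors are assumed to be already initialised and allocated, and the global CLKernelLibrary compile context is used):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/runtime/gpu/cl/operators/ClDequantize.h"

    using namespace arm_compute;

    void dequantize_example(CLTensor &q_src, CLTensor &f_dst)
    {
        opencl::ClDequantize dequant;
        dequant.configure(CLKernelLibrary::get().get_compile_context(), q_src.info(), f_dst.info());

        // The operator holds no tensor state: the actual tensors are supplied per run via an ITensorPack.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &q_src);
        pack.add_tensor(TensorType::ACL_DST, &f_dst);
        dequant.run(pack);
    }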
diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp b/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
deleted file mode 100644
index 13ef42a640..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConv2d.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClDirectConv2d.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace
-{
-ITensorPack select_activation_src_dst(ITensorPack &tensors)
-{
- ITensorPack pack;
- pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
- pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
- return pack;
-}
-} // namespace
-
-void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
-
- // Configure direct convolution kernel
- const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo();
- auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
- k->set_target(CLScheduler::get().target());
- k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info);
- _direct_conv_kernel = std::move(k);
-
- // Configure border handler
- PixelValue zero_value(0.f);
- if(is_data_type_quantized_asymmetric(src->data_type()))
- {
- zero_value = PixelValue(0, src->data_type(), src->quantization_info());
- }
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
- _src_border_handler = std::move(b);
-
- // Fused activation is currently supported for NHWC and floating point types
- if(act_info.enabled() && !conv2d_act_info.enabled())
- {
- auto a = std::make_unique<kernels::ClActivationKernel>();
- a->configure(compile_context, dst, dst, act_info);
- _activation_kernel = std::move(a);
- }
-
- // Tune kernels
- CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
-}
-
-Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), CLScheduler::get().target()));
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
- }
- return Status{};
-}
-
-void ClDirectConv2d::run(ITensorPack &tensors)
-{
- // Run border handler
- CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
- // Run direct convolution
- CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
- // Run activation kernel
- if(_activation_kernel)
- {
- auto act_pack = select_activation_src_dst(tensors);
- CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
- }
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClDirectConv2d.h b/src/runtime/gpu/cl/operators/ClDirectConv2d.h
deleted file mode 100644
index a2785b52e3..0000000000
--- a/src/runtime/gpu/cl/operators/ClDirectConv2d.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
-#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run a direct convolution layer. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::ClDirectConv2dKernel
- */
-class ClDirectConv2d : public IClOperator
-{
-public:
- ClDirectConv2d() = default;
- /** Set the src and dst tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of srcs.
- * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
- * Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClDirectConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-
-private:
- std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
- std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
- std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */ \ No newline at end of file
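In the same spirit, a hedged sketch of the removed ClDirectConv2d interface at the tensor-info level. The signatures come from the header above; the NHWC layout, shapes, padding and the fused ReLU are assumptions chosen to exercise the fused-activation path described in the .cpp.

    // Hedged sketch: validating and configuring ClDirectConv2d (illustrative only).
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/runtime/gpu/cl/operators/ClDirectConv2d.h" // pre-move path, as listed in this diff

    void direct_conv2d_sketch()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init(); // set up a default CL context/queue

        // NHWC tensors: src [IFM, W, H, N], weights [IFM, kx, ky, OFM], dst [OFM, W, H, N] (assumed).
        TensorInfo src(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
        TensorInfo weights(TensorShape(16U, 3U, 3U, 8U), 1, DataType::F32);
        TensorInfo biases(TensorShape(8U), 1, DataType::F32);
        TensorInfo dst(TensorShape(8U, 32U, 32U, 1U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        weights.set_data_layout(DataLayout::NHWC);
        dst.set_data_layout(DataLayout::NHWC);

        const PadStrideInfo       conv_info(1, 1, 1, 1); // stride 1, pad 1: 3x3 "same" convolution
        const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClDirectConv2d::validate(&src, &weights, &biases, &dst, conv_info, act_info));

        opencl::ClDirectConv2d conv;
        conv.configure(CLKernelLibrary::get().get_compile_context(), &src, &weights, &biases, &dst, conv_info, act_info);
        // run() then takes an ITensorPack with the actual CLTensors, as in the ClDequantize sketch above.
    }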
diff --git a/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp b/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp
deleted file mode 100644
index e5b836a0d8..0000000000
--- a/src/runtime/gpu/cl/operators/ClElementwiseOperations.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClElementwiseOperations.h"
-
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info);
-}
-
-void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::MAX, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info);
-}
-
-void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::MIN, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info);
-}
-
-void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
-}
-
-void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::POWER, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClElementwiseOperations.h b/src/runtime/gpu/cl/operators/ClElementwiseOperations.h
deleted file mode 100644
index c01b107d97..0000000000
--- a/src/runtime/gpu/cl/operators/ClElementwiseOperations.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
-#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for division
- *
- * @note The tensor data type for the inputs must be F16/F32.
- * @note The function performs an arithmetic division between two tensors.
- */
-class ClElementwiseDivision : public IClOperator
-{
-public:
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: F16/F32.
- * @param[in]  src2            Second source tensor info. Data types supported: same as @p src1.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClElementwiseDivision::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
- * @note The function performs a max operation between two tensors.
- */
-class ClElementwiseMax : public IClOperator
-{
-public:
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClElementwiseMax::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
- * @note The function performs a min operation between two tensors.
- */
-class ClElementwiseMin : public IClOperator
-{
-public:
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClElementwiseMin::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2).
- */
-class ClElementwiseSquaredDiff : public IClOperator
-{
-public:
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClElementwiseSquaredDiff::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
- *
- * @note The tensor data type for the inputs must be F16/F32.
- * @note The function raises in1 to the power of in2 element-wise (i.e., out[i] = in1[i] ^ in2[i]).
- */
-class ClElementwisePower : public IClOperator
-{
-public:
- /** Configure function for a given list of arguments.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src1 First source tensor info. Data types supported: F16/F32.
- * @param[in] src2 Second source tensor info. Data types supported: F16/F32.
- * @param[out] dst             Destination tensor info. Data types supported: F16/F32.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClElementwisePower::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H */
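The five binary element-wise operators above share one shape of API and differ only in the ArithmeticOperation they forward to the arithmetic kernel. A hedged sketch for ClElementwiseMax follows; the shapes and the bounded-ReLU fusion are assumptions, the signatures come from the header above.

    // Hedged sketch: ClElementwiseMax with an assumed fused activation (illustrative only).
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/runtime/gpu/cl/operators/ClElementwiseOperations.h" // pre-move path, as listed in this diff

    void elementwise_max_sketch()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init(); // set up a default CL context/queue

        TensorInfo a(TensorShape(64U, 8U), 1, DataType::F32);
        TensorInfo b(TensorShape(64U, 8U), 1, DataType::F32);
        TensorInfo out(TensorShape(64U, 8U), 1, DataType::F32);

        const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClElementwiseMax::validate(&a, &b, &out, act));

        opencl::ClElementwiseMax max_op;
        max_op.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, &out, act);
        // ClElementwiseDivision/Min/SquaredDiff/Power are configured the same way.
    }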
diff --git a/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp b/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp
deleted file mode 100644
index 7b830a077f..0000000000
--- a/src/runtime/gpu/cl/operators/ClElementwiseUnary.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClElementwiseUnary.h"
-
-#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT);
- _kernel = std::move(k);
-}
-
-Status ClRsqrt::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::RSQRT);
-}
-
-void ClExp::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::EXP);
- _kernel = std::move(k);
-}
-
-Status ClExp::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::EXP);
-}
-
-void ClNeg::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::NEG);
- _kernel = std::move(k);
-}
-
-Status ClNeg::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::NEG);
-}
-
-void ClSin::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::SIN);
- _kernel = std::move(k);
-}
-
-Status ClSin::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::SIN);
-}
-
-void ClAbs::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::ABS);
- _kernel = std::move(k);
-}
-
-Status ClAbs::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ABS);
-}
-
-void ClLog::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::LOG);
- _kernel = std::move(k);
-}
-
-Status ClLog::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOG);
-}
-
-void ClRound::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::ROUND);
- _kernel = std::move(k);
-}
-
-Status ClRound::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ROUND);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClElementwiseUnary.h b/src/runtime/gpu/cl/operators/ClElementwiseUnary.h
deleted file mode 100644
index b9acf6f5b8..0000000000
--- a/src/runtime/gpu/cl/operators/ClElementwiseUnary.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
-#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to perform inverse square root on an src tensor. */
-class ClRsqrt : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClRsqrt::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to perform exponential on an src tensor. */
-class ClExp : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClExp::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to negate an src tensor. */
-class ClNeg : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClNeg::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to calculate sine of an src tensor. */
-class ClSin : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClSin::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to perform elementwise log on an src tensor. */
-class ClLog : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClLog::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to get the absolute value of an src tensor. */
-class ClAbs : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClAbs::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-
-/** Basic function to get the round (to the nearest even) value of an src tensor. */
-class ClRound : public IClOperator
-{
-public:
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClRound::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H */
diff --git a/src/runtime/gpu/cl/operators/ClFill.cpp b/src/runtime/gpu/cl/operators/ClFill.cpp
deleted file mode 100644
index 4d0afaef24..0000000000
--- a/src/runtime/gpu/cl/operators/ClFill.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClFill.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClFillKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
-{
- auto k = std::make_unique<kernels::ClFillKernel>();
- k->configure(compile_context, tensor, constant_value, dst_window);
- _kernel = std::move(k);
-}
-
-Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
-{
- return kernels::ClFillKernel::validate(tensor, constant_value, dst_window);
-}
-} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClFill.h b/src/runtime/gpu/cl/operators/ClFill.h
deleted file mode 100644
index cc79b915a7..0000000000
--- a/src/runtime/gpu/cl/operators/ClFill.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FILL_H
-#define ARM_COMPUTE_CL_FILL_H
-
-#include "arm_compute/core/Window.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClFillKernel */
-class ClFill : public IClOperator
-{
-public:
- /** Initialise the kernel's tensor and filling value
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] tensor Source tensor info. Supported data types: All.
- * @param[in] constant_value The value used to fill the planes of the tensor
- * @param[in]     window          Window to be used when setting only part of the tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClFill::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FILL_H */
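A hedged sketch of the removed ClFill interface; the tensor shape and the fill value are assumptions, and leaving the window unset, as below, fills the whole tensor per the documentation above.

    // Hedged sketch: filling a whole tensor with a constant (illustrative only).
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/runtime/gpu/cl/operators/ClFill.h" // pre-move path, as listed in this diff

    void fill_sketch()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init(); // set up a default CL context/queue

        TensorInfo       t(TensorShape(8U, 8U), 1, DataType::F32);
        const PixelValue zero(0.f);

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClFill::validate(&t, zero));

        opencl::ClFill fill;
        fill.configure(CLKernelLibrary::get().get_compile_context(), &t, zero); // no Window: fill everything
    }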
diff --git a/src/runtime/gpu/cl/operators/ClFlatten.cpp b/src/runtime/gpu/cl/operators/ClFlatten.cpp
deleted file mode 100644
index 060b653dee..0000000000
--- a/src/runtime/gpu/cl/operators/ClFlatten.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClFlatten.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClReshapeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClReshapeKernel::validate(src, dst);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClFlatten.h b/src/runtime/gpu/cl/operators/ClFlatten.h
deleted file mode 100644
index 8bd619b518..0000000000
--- a/src/runtime/gpu/cl/operators/ClFlatten.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FLATTEN_H
-#define ARM_COMPUTE_CL_FLATTEN_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to flatten a given input */
-class ClFlatten : public IClOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src |dst |
- * |:--------------|:--------------|
- * |All |All |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All
- * @param[out] dst             Destination tensor with shape [w*h*d, input_batches] where:
- *                             w = width, h = height and d = depth of the input tensor.
- * Data type supported: same as @p src
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClFlatten::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FLATTEN_H */
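A hedged sketch of the removed ClFlatten interface, mainly to make the destination-shape rule above concrete: an assumed [w, h, d, batches] source of [4, 3, 2, 5] flattens to [w*h*d, batches] = [24, 5].

    // Hedged sketch: flatten shape derivation (illustrative only).
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/runtime/gpu/cl/operators/ClFlatten.h" // pre-move path, as listed in this diff

    void flatten_sketch()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init(); // set up a default CL context/queue

        TensorInfo src(TensorShape(4U, 3U, 2U, 5U), 1, DataType::F32); // [w, h, d, batches]
        TensorInfo dst(TensorShape(24U, 5U), 1, DataType::F32);        // [w*h*d, batches]

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClFlatten::validate(&src, &dst));

        opencl::ClFlatten flatten;
        flatten.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst);
    }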
diff --git a/src/runtime/gpu/cl/operators/ClFloor.cpp b/src/runtime/gpu/cl/operators/ClFloor.cpp
deleted file mode 100644
index 94e77c0c54..0000000000
--- a/src/runtime/gpu/cl/operators/ClFloor.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClFloor.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClFloorKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClFloorKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClFloorKernel::validate(src, dst);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClFloor.h b/src/runtime/gpu/cl/operators/ClFloor.h
deleted file mode 100644
index 90bdee6c7e..0000000000
--- a/src/runtime/gpu/cl/operators/ClFloor.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FLOOR_H
-#define ARM_COMPUTE_CL_FLOOR_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClFloorKernel */
-class ClFloor : public IClOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src.
- */
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClFloor::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FLOOR_H */
diff --git a/src/runtime/gpu/cl/operators/ClFullyConnected.cpp b/src/runtime/gpu/cl/operators/ClFullyConnected.cpp
deleted file mode 100644
index 377168d864..0000000000
--- a/src/runtime/gpu/cl/operators/ClFullyConnected.cpp
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClFullyConnected.h"
-
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
-#include "src/runtime/gpu/cl/operators/ClFlatten.h"
-#include "src/runtime/gpu/cl/operators/ClGemm.h"
-#include "src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
-#include "src/runtime/gpu/cl/operators/ClTranspose.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "support/Cast.h"
-
-#include <algorithm>
-
-namespace arm_compute
-{
-namespace opencl
-{
-using namespace arm_compute::experimental;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info)
-{
- gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage.gemmlowp_offset = 0;
- gemmlowp_output_stage.gemmlowp_multiplier = 0;
- gemmlowp_output_stage.gemmlowp_shift = 0;
-
- const auto data_type = src.data_type();
-
- // Configure output stage for quantized case
- if(is_data_type_quantized_asymmetric(data_type))
- {
- const QuantizationInfo oq_info = dst.quantization_info();
- const UniformQuantizationInfo iq_unif = src.quantization_info().uniform();
- const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform();
- const UniformQuantizationInfo oq_unif = oq_info.uniform();
-
- const auto output_quant_info = (dst.total_size() == 0) ? iq_unif : oq_unif;
-
- const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
-
- if(activation_info.enabled())
- {
- std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
- }
-
- // Set the GEMMLowp output stage info
- gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
- gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
- gemmlowp_output_stage.gemmlowp_shift = output_shift;
- gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
- gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
- type_min.get(gemmlowp_output_stage.gemmlowp_min_bound);
- type_max.get(gemmlowp_output_stage.gemmlowp_max_bound);
- }
-
- return Status{};
-}
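As an aside for orientation: the scaling computed in construct_gemmlowp_output_stage() above follows the usual gemmlowp fixed-point scheme. The real multiplier M = (scale_src * scale_weights) / scale_dst is decomposed by calculate_quantized_multiplier() into an integer multiplier and a shift with M approximately equal to multiplier * 2^-31 * 2^-shift. As an assumed numerical example, scale_src = 0.5, scale_weights = 0.25 and scale_dst = 0.625 give M = 0.2 = 0.8 * 2^-2, i.e. an integer multiplier of round(0.8 * 2^31) = 1717986918 with shift = 2 (the exact sign convention of the shift is the helper's and is treated here as an assumption).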
-
-Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info)
-{
- GEMMLowpOutputStageInfo gemmlowp_output_stage;
- ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
-
- const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
- false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
- 0, // depth_output_gemm3d
- false, // reinterpret_input_as_3d
- fc_info.retain_internal_weights, // retain_internal_weights
- gemmlowp_output_stage, // gemmlowp_output_stage
- fc_info.fp_mixed_precision, // fp_mixed_precision
- false, // fast_math
- true, // broadcast_bias
- ActivationLayerInfo()); // activation_info
-
- if(is_data_type_quantized_asymmetric(src.data_type()))
- {
- const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
-
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset);
- const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
-
- // Validate gemmlowp function
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info),
- &weights.clone()->set_quantization_info(weights_quantization_info),
- bias,
- &dst,
- gemm_info));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info));
- }
-
- return Status{};
-}
-} // namespace
-
-ClFullyConnected::ClFullyConnected()
- : _convert_weights(nullptr),
- _flatten(nullptr),
- _reshape_weights(nullptr),
- _mm_gemm(nullptr),
- _mm_gemmlowp(nullptr),
- _aux_mem(Count)
-{
-}
-
-ClFullyConnected::~ClFullyConnected() = default;
-
-void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
- const FullyConnectedLayerInfo &fc_info)
-{
- GEMMLowpOutputStageInfo gemmlowp_output_stage;
- construct_gemmlowp_output_stage(*src, *weights, *dst, gemmlowp_output_stage, fc_info.activation_info);
-
- const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
- false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
- 0, // depth_output_gemm3d
- false, // reinterpret_input_as_3d
- fc_info.retain_internal_weights, // retain_internal_weights
- gemmlowp_output_stage, // gemmlowp_output_stage
- fc_info.fp_mixed_precision, // fp_mixed_precision
- false, // fast_math
- true, // broadcast_bias
- fc_info.activation_info, // activation_info
- fc_info.constant_weights); // constant_weights
-
- if(_is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo src_quantization_info = src->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->quantization_info();
-
- TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
- TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
-
- src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
- weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
- // Configure gemmlowp function
- _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info);
- }
- else
- {
- // Configure matrix multiply kernel
- _mm_gemm = std::make_unique<ClGemm>();
- _mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info);
- }
-}
-
-void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
- const FullyConnectedLayerInfo &fc_info)
-{
- ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for flatten
- _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW);
-
- // Configure flatten kernel
- _flatten = std::make_unique<ClFlatten>();
- _flatten->configure(compile_context, src, &_flattened_src);
-
- // Configure matrix multiply kernel
- configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info);
-}
-
-void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
- const FullyConnectedLayerInfo &fc_info)
-{
- ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
-
- // Configure matrix multiply kernel
- configure_mm(compile_context, src, weights, bias, dst, fc_info);
-}
-
-void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info));
-
- _are_weights_converted = true;
- _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- _is_fc_after_conv = true;
- _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
- _is_prepared = fc_info.retain_internal_weights;
- _weights_to_use = TensorInfo(*weights);
- _weights_to_use_idx = ACL_SRC_1;
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(is_batched_fc_layer)
- {
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
- src->tensor_shape().cend(),
- dst->tensor_shape().cbegin() + 1));
- }
- else
- {
- _is_fc_after_conv = src->num_dimensions() > 1;
- }
-
- ITensorInfo *weights_used = weights;
-
- // Reshape weights if needed
- if(!_are_weights_reshaped)
- {
- // Reshape the weights
- _reshape_weights = std::make_unique<ClTranspose>();
- _reshape_weights->configure(compile_context, weights, &_reshaped_weights);
- weights_used = &_reshaped_weights;
- _weights_to_use_idx = offset_int_vec(TransposedWeights);
- }
-
- // Convert weights if needed
- if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
- {
- // Convert weights
- _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>();
- _convert_weights->configure(compile_context,
- weights_used,
- &_converted_weights,
- src->tensor_shape(),
- fc_info.weights_trained_layout);
-
- weights_used = &_converted_weights;
- _weights_to_use_idx = offset_int_vec(ConvertedWeights);
- _are_weights_converted = false;
- }
-
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info);
- }
-    // Update TensorInfo of the final weights used (needs to be done at the end due to padding expansion)
- _weights_to_use = *weights_used;
-
- // Set auxiliary memory requirements
- auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
- for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
- {
- _aux_mem[i] = gemm_mem_req[i];
- }
- if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
- {
-        // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
- }
- else
- {
-        // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
- const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
-
- _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), transposed_wei_lft, _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), converted_wei_lft, _converted_weights.total_size());
- }
- _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
-}
-
-Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
- ARM_COMPUTE_RETURN_ERROR_ON(!fc_info.constant_weights && (!fc_info.are_weights_reshaped || fc_info.transpose_weights));
-
- bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- bool is_fc_after_conv = true;
-
- const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const ITensorInfo *src_to_use = src;
- const ITensorInfo *weights_to_use = weights;
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(is_batched_fc_layer)
- {
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
- src->tensor_shape().cend(),
- dst->tensor_shape().cbegin() + 1));
- }
- else
- {
- is_fc_after_conv = src->num_dimensions() > 1;
- }
-
- if(!weights_reshaped)
- {
- // Validate reshape weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights));
- weights_to_use = &reshaped_weights;
- }
-
- if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
- {
- // Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- src->tensor_shape(),
- fc_info.weights_trained_layout));
- weights_to_use = &converted_weights;
- }
-
- if(is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
-
- // Validate flatten kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src));
- src_to_use = &flatten_src;
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
- }
-
- // Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*src_to_use, *weights_to_use, biases, *dst, fc_info));
-
- return Status{};
-}
-
-void ClFullyConnected::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- auto src = tensors.get_const_tensor(ACL_SRC_0);
-
- CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
- CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false);
-
- // Linearize input if it comes from a convolutional layer
- if(_is_fc_after_conv)
- {
- ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
- _flatten->run(flatten_pack);
- }
-
- ITensorPack gemm_pack = tensors;
- gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
- if(_weights_to_use_idx != ACL_SRC_1)
- {
- gemm_pack.add_const_tensor(ACL_SRC_1, weights.get());
- }
-
- // Run matrix multiply
- if(_is_quantized)
- {
- _mm_gemmlowp->run(gemm_pack);
- }
- else
- {
- _mm_gemm->run(gemm_pack);
- }
-}
-
-void ClFullyConnected::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto weights = tensors.get_const_tensor(ACL_SRC_1);
-
- CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
- CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);
-
- // Pointer to current weights
- const ITensor *cur_weights = weights;
-
- // Reshape of the weights if needed (happens only once)
- if(!_are_weights_reshaped)
- {
- // Run reshape weights kernel and mark weights as unused
- ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
- _reshape_weights->run(transpose_pack);
-
- cur_weights->mark_as_unused();
- cur_weights = reshaped_weights.get();
-
- _are_weights_reshaped = true;
- }
-
- // Convert weights if needed (happens only once)
- if(!_are_weights_converted)
- {
- ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
- _convert_weights->run(convert_pack);
-
- cur_weights->mark_as_unused();
- cur_weights = converted_weights.get();
-
- _are_weights_converted = true;
- }
-
- tensors.add_const_tensor(ACL_SRC_1, cur_weights);
-
-        // Run the GEMM prepare stage and release unused weights
- if(!_is_quantized)
- {
- _mm_gemm->prepare(tensors);
- }
- else
- {
- _mm_gemmlowp->prepare(tensors);
- }
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements ClFullyConnected::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute
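For reference, a minimal sketch of how the operator interface above is typically driven: configure() works on ITensorInfo metadata only, while run() receives the actual buffers through an ITensorPack. The tensor setup, the compile context and the handling of the auxiliary workspace tensors reported by workspace() are illustrative assumptions rather than part of this file.

#include "arm_compute/core/CL/ICLTensor.h"
#include "src/runtime/gpu/cl/operators/ClFullyConnected.h"

using namespace arm_compute;

// Hypothetical helper: src/weights/bias/dst are pre-allocated CL tensors whose metadata
// matches what is passed to configure().
void run_fully_connected(const CLCompileContext &ctx, ICLTensor *src, ICLTensor *weights, ICLTensor *bias, ICLTensor *dst)
{
    opencl::ClFullyConnected fc;
    FullyConnectedLayerInfo fc_info{};

    // Configuration only needs tensor metadata
    fc.configure(ctx, src->info(), weights->info(), bias->info(), dst->info(), fc_info);

    // At execution time the buffers are bound through a tensor pack; auxiliary tensors
    // (slots reported by fc.workspace()) would normally be imported here as well.
    ITensorPack pack{ { ACL_SRC_0, src }, { ACL_SRC_1, weights }, { ACL_SRC_2, bias }, { ACL_DST, dst } };
    fc.run(pack); // run() invokes prepare() internally on the first execution
}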
diff --git a/src/runtime/gpu/cl/operators/ClFullyConnected.h b/src/runtime/gpu/cl/operators/ClFullyConnected.h
deleted file mode 100644
index 86f95756d5..0000000000
--- a/src/runtime/gpu/cl/operators/ClFullyConnected.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_FULLY_CONNECTED_H
-#define ARM_COMPUTE_CL_FULLY_CONNECTED_H
-
-#include "arm_compute/core/TensorInfo.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-// Forward declarations
-class ClConvertFullyConnectedWeights;
-class ClFlatten;
-class ClGemm;
-class ClGemmLowpMatrixMultiplyCore;
-class ClTranspose;
-
-/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL operators:
- *
- * -# @ref ClFlatten (called when the input comes from a convolutional layer)
- * -# @ref ClTranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true) (called once)
- * -# @ref ClConvertFullyConnectedWeights (if the weights were trained in a different data layout) (called once)
- * -# @ref ClGemm or @ref ClGemmLowpMatrixMultiplyCore (if quantized asymmetric)
- *
- * @note The fully connected layer accepts only 2-dimensional "weights" tensors.
- */
-class ClFullyConnected : public IClOperator
-{
-public:
- ClFullyConnected();
- ~ClFullyConnected();
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:------------------|:------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p src.
- * @param[in]  biases          Bias tensor. Can be nullptr. Data type supported: Same as @p src.
- * @param[out] dst Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p src.
- * @param[in] fc_info (Optional) Fully connected layer additional info
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClFullyConnected::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-
-    // Inherited methods overridden
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
- void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
- void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
-
-private:
- enum AuxTensorIdx
- {
- TransposedWeights = 10,
- ConvertedWeights = 11,
- FlattenedSrc = 12,
- Count = 13
- };
-
- std::unique_ptr<ClConvertFullyConnectedWeights> _convert_weights;
- std::unique_ptr<ClFlatten> _flatten;
- std::unique_ptr<ClTranspose> _reshape_weights;
- std::unique_ptr<ClGemm> _mm_gemm;
- std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
-
- experimental::MemoryRequirements _aux_mem{};
-
- TensorInfo _flattened_src{};
- TensorInfo _converted_weights{};
- TensorInfo _reshaped_weights{};
-
- TensorInfo _weights_to_use{};
- int _weights_to_use_idx{ ACL_SRC_1 };
-
- bool _are_weights_converted{ true };
- bool _are_weights_reshaped{ true };
- bool _is_fc_after_conv{ true };
- bool _is_quantized{ false };
- bool _is_prepared{ false };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FULLY_CONNECTED_H */
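The AuxTensorIdx slots above map onto the experimental::MemoryRequirements returned by workspace(). A rough sketch of how a caller could satisfy that contract is shown below; the MemoryInfo field names (slot, size) and the CLTensor allocation calls are assumed from the surrounding runtime code, and lifetime handling (Temporary/Prepare/Persistent) is deliberately ignored.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include <memory>
#include <vector>

// Allocate one backing CLTensor per auxiliary requirement and expose it to the operator
// through the tensor pack so that CLAuxTensorHandler can pick it up at run/prepare time.
std::vector<std::unique_ptr<arm_compute::CLTensor>>
allocate_workspace(const arm_compute::experimental::MemoryRequirements &reqs, arm_compute::ITensorPack &pack)
{
    std::vector<std::unique_ptr<arm_compute::CLTensor>> workspace;
    for(const auto &req : reqs)
    {
        if(req.size == 0)
        {
            continue;
        }
        auto tensor = std::make_unique<arm_compute::CLTensor>();
        tensor->allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(req.size), 1, arm_compute::DataType::U8));
        tensor->allocator()->allocate();
        pack.add_tensor(req.slot, tensor.get());
        workspace.emplace_back(std::move(tensor));
    }
    return workspace;
}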
diff --git a/src/runtime/gpu/cl/operators/ClGemm.cpp b/src/runtime/gpu/cl/operators/ClGemm.cpp
deleted file mode 100644
index 59bbabba26..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemm.cpp
+++ /dev/null
@@ -1,771 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClGemm.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Log.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-
-#include "src/common/utils/Log.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
-#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "support/Cast.h"
-#include "utils/TypePrinter.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::cl_gemm;
-using namespace arm_compute::experimental;
-using namespace arm_compute::utils::cast;
-using namespace arm_compute::opencl::kernels;
-
-namespace
-{
-inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
-{
- switch(kernel_type)
- {
- case CLGEMMKernelType::NATIVE_V1:
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- case CLGEMMKernelType::RESHAPED_V1:
- case CLGEMMKernelType::RESHAPED:
- {
- return true;
- }
- default:
- {
- return false;
- }
- }
-}
-// Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
-inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
-{
- if(!constant_weights)
- {
- return CLGEMMKernelType::NATIVE_V1;
- }
-
- auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
- if(bool(gemm_kernel))
- {
- if(validate_gemm_kernel(gemm_kernel.gemm_type))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
- return gemm_kernel.gemm_type;
- }
- }
- gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
- return gemm_kernel.gemm_type;
-}
-// Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)
-{
- // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
- TensorInfo tmp_b_info{};
- // Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
- {
- return false;
- }
- // Validate mm kernel
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- gemm_kernel_info.has_pad_y = false;
- if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
- {
- return false;
- }
- gemm_kernel_info.has_pad_y = true;
- if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
- {
- return false;
- }
- return true;
-}
-
-// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,
- const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output)
-{
- auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
- if(config)
- {
- if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
- }
- }
- config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
-}
-
-// Validate lhs_info and rhs_info for reshaped kernel
-inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)
-{
- // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
-
- // Validate reshape LHS kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
- if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
- {
- return false;
- }
-
- // Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
- {
- return false;
- }
- // Validate mm kernel
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
- {
- return false;
- }
- return true;
-}
-
-// Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs
-inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)
-{
- auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);
- if(config)
- {
- if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
- }
- }
- config = auto_heuristics::select_default_gemm_config_reshaped(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
-}
-} // namespace
-
-ClGemm::ClGemm()
- : _mm_kernel(std::make_unique<ClGemmMatrixMultiplyKernel>()),
- _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()),
- _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
- _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
- _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
- _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
- _tmp_a(),
- _tmp_b(),
- _reshape_b_only_on_first_run(false),
- _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1),
- _is_prepared(false),
- _aux_mem(AuxTensorIdx::Count)
-{
-}
-
-void ClGemm::configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
-{
- const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Set the target for the kernels
- _mm_kernel->set_target(gpu_target);
-
- GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
-
- // Configure and tune matrix multiply kernel
- _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
-
- // Tune kernel statically
- CLScheduler::get().tune_kernel_static(*_mm_kernel);
-}
-
-void ClGemm::configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
-{
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
-
- // Set the target for the kernels
- _reshape_lhs_kernel->set_target(gpu_target);
- _mm_kernel->set_target(gpu_target);
-
- if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
- {
- mult_transpose1xW_width = 4;
- mult_interleave4x4_height = 2;
- }
-
- GEMMRHSMatrixInfo rhs_info;
- rhs_info.n0 = 16 / b->element_size();
- rhs_info.k0 = 1;
- rhs_info.h0 = mult_transpose1xW_width;
- rhs_info.interleave = false;
- rhs_info.transpose = false;
-
- GEMMLHSMatrixInfo lhs_info;
- lhs_info.m0 = 4;
- lhs_info.k0 = 4;
- lhs_info.v0 = mult_interleave4x4_height;
- lhs_info.interleave = true;
- lhs_info.transpose = true;
-
- GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
-
- // Configure interleave kernel
- _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
-
- // Configure transpose kernel
- _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
-
- // Configure and tune matrix multiply kernel
- _mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
-
- CLScheduler::get().tune_kernel_static(*_mm_kernel);
-
- // Request memory for LHS and RHS reshape matrix
- _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
-}
-
-void ClGemm::configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
-{
- DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool broadcast_bias = gemm_info.broadcast_bias();
-
- GEMMKernelInfo kernel_info;
- kernel_info.m = m;
- kernel_info.n = n;
- kernel_info.k = k;
- kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- kernel_info.reinterpret_input_as_3d = false;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = gemm_info.activation_info();
-
- // Set the target for the kernels
- _reshape_lhs_kernel->set_target(gpu_target);
- _mm_kernel->set_target(gpu_target);
-
- GEMMLHSMatrixInfo lhs_info{};
- GEMMRHSMatrixInfo rhs_info{};
-
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b,
- c, output, gemm_info.reinterpret_input_as_3d());
-
- _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
- _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
-
- // Configure and tune matrix multiply kernel
- _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
-
- // Request memory for LHS and RHS reshape matrix
- _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
-}
-
-void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
-{
- DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool broadcast_bias = gemm_info.broadcast_bias();
-
- GEMMKernelInfo kernel_info;
- kernel_info.m = m;
- kernel_info.n = n;
- kernel_info.k = k;
- kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = gemm_info.activation_info();
-
- // Set the target for the kernels
- _mm_kernel->set_target(gpu_target);
-
- GEMMLHSMatrixInfo lhs_info{};
- GEMMRHSMatrixInfo rhs_info{};
-
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output);
-
- // Transpose matrix
- _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
-
-    // Configure two variants of ClGemmMatrixMultiplyReshapedOnlyRhsKernel (has_pad_y = false/true)
-    // At run time we check the padding requirement for the lhs and dst tensors. If they do not have
-    // pad y, we dispatch ClGemmMatrixMultiplyReshapedOnlyRhsKernel with has_pad_y = false
-
- // Configure matrix multiply kernel with no y padding support
- kernel_info.has_pad_y = false;
- _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
-
- // Configure matrix multiply kernel with y padding support
- kernel_info.has_pad_y = true;
- _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
-
- // Request memory for RHS reshape matrix
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
-}
-
-Status ClGemm::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(output);
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta,
- false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
-
- return Status{};
-}
-
-Status ClGemm::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(output);
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
- const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
- {
- mult_transpose1xW_width = 4;
- mult_interleave4x4_height = 2;
- }
-
- GEMMRHSMatrixInfo rhs_info;
- rhs_info.n0 = 16 / b->element_size();
- rhs_info.k0 = 1;
- rhs_info.h0 = mult_transpose1xW_width;
- rhs_info.interleave = false;
- rhs_info.transpose = false;
-
- GEMMLHSMatrixInfo lhs_info;
- lhs_info.m0 = 4;
- lhs_info.k0 = 4;
- lhs_info.v0 = mult_interleave4x4_height;
- lhs_info.interleave = true;
- lhs_info.transpose = true;
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
- true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
-
- return Status{};
-}
-
-Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(output);
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
- DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool broadcast_bias = gemm_info.broadcast_bias();
-
- GEMMKernelInfo kernel_info;
- kernel_info.m = m;
- kernel_info.n = n;
- kernel_info.k = k;
- kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- kernel_info.reinterpret_input_as_3d = false;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = gemm_info.activation_info();
-
- GEMMLHSMatrixInfo lhs_info;
- GEMMRHSMatrixInfo rhs_info;
-
- // Pick up the GEMM configuration
- // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
- lhs_info = gemm_config.lhs_info;
- rhs_info = gemm_config.rhs_info;
-
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
-
- return Status{};
-}
-
-Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(output);
-
- TensorInfo tmp_b_info{};
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
- const DataType data_type = a->data_type();
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool broadcast_bias = gemm_info.broadcast_bias();
-
- GEMMKernelInfo kernel_info;
- kernel_info.m = m;
- kernel_info.n = n;
- kernel_info.k = k;
- kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- kernel_info.broadcast_bias = broadcast_bias;
- kernel_info.activation_info = gemm_info.activation_info();
-
- GEMMLHSMatrixInfo lhs_info;
- GEMMRHSMatrixInfo rhs_info;
-
- // Pick up the GEMM configuration
- // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
- lhs_info = gemm_config.lhs_info;
- rhs_info = gemm_config.rhs_info;
-
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
-
- // Validate matrix multiply
- kernel_info.has_pad_y = false;
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
-
- kernel_info.has_pad_y = true;
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
-
- return Status{};
-}
-
-void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));
-
- // Check if we need to reshape the matrix B only on the first run
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _is_prepared = gemm_info.retain_internal_weights();
-
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-
- // Select GEMMType
- _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run,
- gemm_info.constant_weights());
-
- const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
-
- ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
-
- switch(_gemm_kernel_type)
- {
- case CLGEMMKernelType::NATIVE_V1:
- {
- configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
- break;
- }
- case CLGEMMKernelType::RESHAPED_V1:
- {
- configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
- break;
- }
- case CLGEMMKernelType::RESHAPED:
- {
- configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
- break;
- }
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- {
- configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("GEMMType not supported");
- }
- }
-}
-
-Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- // Get the GPU target
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-
- // Select GEMMType
- CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
- {
- CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
- },
- gemm_info.reshape_b_only_on_first_run(), gemm_info.constant_weights());
-
- const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
-
- const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
-
- switch(gemm_kernel_type)
- {
- case CLGEMMKernelType::NATIVE_V1:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
- break;
- }
- case CLGEMMKernelType::RESHAPED_V1:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
- break;
- }
- case CLGEMMKernelType::RESHAPED:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
- break;
- }
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
- break;
- }
- default:
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
- }
- }
-
- return Status{};
-}
-
-void ClGemm::run(ITensorPack &tensors)
-{
- const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0);
- const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1);
- const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2);
- ITensor *dst = tensors.get_tensor(ACL_DST);
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst);
-
- CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true);
- CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
-
- // Prepare the consts if needed
- prepare(tensors);
-
- // Run matrix multiply kernel
- switch(_gemm_kernel_type)
- {
- case CLGEMMKernelType::NATIVE_V1:
- {
- CLScheduler::get().enqueue_op(*_mm_kernel, tensors, true);
- break;
- }
- case CLGEMMKernelType::RESHAPED_V1:
- case CLGEMMKernelType::RESHAPED:
- {
- // Run interleave kernel
- ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } };
- CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
- CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
- }
-
- ITensorPack gemm_reshaped_pack{ { ACL_SRC_0, lhs_reshaped.get() }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
-
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
- {
- CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
- }
- else
- {
- CLScheduler::get().enqueue_op(*_mm_kernel, gemm_reshaped_pack, true);
- }
- break;
- }
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- {
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
- CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
- }
- // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
- // Check if the lhs or dst tensors have padding
- const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom;
- const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
- bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
-
- ITensorPack gemm_reshaped_onlyrhs_pack{ { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } };
- if(has_pad_y)
- {
- CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true);
- }
- else
- {
- CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_onlyrhs_pack, true);
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("GEMMType not supported");
- }
- }
-}
-
-void ClGemm::prepare(ITensorPack &constants)
-{
- if(!_is_prepared)
- {
- const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
- ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
-
-        // If the RHS memory is persistent and src1 is provided, re-transform; otherwise assume that RHS has already been transformed
-        if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr) && (rhs_aux != nullptr))
- {
- ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
-
- CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
- ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
-
- ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
- CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
- }
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements ClGemm::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute
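A minimal usage sketch for the operator above, assuming pre-allocated CL tensors, a default GEMMInfo and no bias matrix (beta = 0); the validate-before-configure pattern mirrors what configure() itself does internally.

#include "arm_compute/core/CL/ICLTensor.h"
#include "src/runtime/gpu/cl/operators/ClGemm.h"

using namespace arm_compute;

// Hypothetical example computing dst = 1.0f * a * b
Status run_gemm(const CLCompileContext &ctx, ICLTensor *a, ICLTensor *b, ICLTensor *dst)
{
    const GEMMInfo gemm_info{};

    // Surface configuration problems as a Status instead of an assertion later on
    ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClGemm::validate(a->info(), b->info(), nullptr, dst->info(), 1.f, 0.f, gemm_info));

    opencl::ClGemm gemm;
    gemm.configure(ctx, a->info(), b->info(), nullptr, dst->info(), 1.f, 0.f, gemm_info);

    // prepare() runs inside run(); when reshape_b_only_on_first_run is set the RHS is reshaped once
    ITensorPack pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, dst } };
    gemm.run(pack);
    return Status{};
}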
diff --git a/src/runtime/gpu/cl/operators/ClGemm.h b/src/runtime/gpu/cl/operators/ClGemm.h
deleted file mode 100644
index 254344e862..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemm.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_H
-#define ARM_COMPUTE_CL_GEMM_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTypes.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/IClKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
- *
- * -# @ref kernels::ClGemmReshapeLhsMatrixKernel (only if RESHAPED_V1 or RESHAPED is selected by the heuristic model)
- * -# @ref kernels::ClGemmReshapeRhsMatrixKernel (only if RESHAPED_V1, RESHAPED or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel() method)
- * -# @ref kernels::ClGemmMatrixMultiplyKernel (only if NATIVE_V1 or RESHAPED_V1 is selected by the select_gemm_kernel() method)
- * -# @ref kernels::ClGemmMatrixMultiplyReshapedKernel (only if RESHAPED is selected by the select_gemm_kernel() method)
- * -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel() method)
- */
-class ClGemm : public IClOperator
-{
-public:
- /** Constructor */
- ClGemm();
- /** Initialise the kernel's inputs and output
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:------------|:-----------|:---------|:--------------|
- * |F32 |F32 |F32 |F32 |
- * |F16 |F16 |F16 |F16 |
- *
- * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
- *
- * @note All tensors must have the same data type.
- *
- * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[out] output Output tensor. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
- * in case matrix A and matrix B have been already transformed.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClGemm::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- void configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
- static Status validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
-private:
- enum AuxTensorIdx
- {
- LhsReshape = 0,
- RhsReshape,
- Count
- };
-
-private:
- std::unique_ptr<kernels::ClGemmMatrixMultiplyKernel> _mm_kernel;
- std::unique_ptr<kernels::ClGemmReshapeLhsMatrixKernel> _reshape_lhs_kernel;
- std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _reshape_rhs_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel> _mm_reshaped_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
- std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_fallback_kernel;
- TensorInfo _tmp_a;
- TensorInfo _tmp_b;
- bool _reshape_b_only_on_first_run;
- CLGEMMKernelType _gemm_kernel_type;
- bool _is_prepared;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_H */
diff --git a/src/runtime/gpu/cl/operators/ClGemmConv2d.cpp b/src/runtime/gpu/cl/operators/ClGemmConv2d.cpp
deleted file mode 100644
index 8c796e0712..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemmConv2d.cpp
+++ /dev/null
@@ -1,628 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClGemmConv2d.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/kernels/ClActivationKernel.h"
-#include "src/core/gpu/cl/kernels/ClCol2ImKernel.h"
-#include "src/core/gpu/cl/kernels/ClIm2ColKernel.h"
-#include "src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClGemm.h"
-#include "src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-using namespace experimental;
-using namespace misc::shape_calculator;
-using namespace utils::cast;
-namespace opencl
-{
-ClGemmConv2d::ClGemmConv2d()
- : _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(),
- _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
-{
-}
-ClGemmConv2d::~ClGemmConv2d() = default;
-
-void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
-
- const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
- false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
- gemm_3d_depth, // depth_output_gemm3d
- _skip_im2col, // reinterpret_input_as_3d
- false, // retain_internal_weights
- gemmlowp_output_stage, // gemmlowp_output_stage
- false, // fast_math
- false, // fp_mixed_precision
- true, // broadcast_bias
- act_info); // activation_info
-
- TensorInfo tmp_src{ *src };
- if(_is_quantized)
- {
-        // Since computing the convolution requires negated offsets, we need to change the QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info = src->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->quantization_info();
-
- tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
- _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info);
-
-        // Restore the original QuantizationInfo as the weights could be used in other convolution layers
- weights->set_quantization_info(weights_quantization_info);
-
- auto mm_mem_req = _mm_gemmlowp->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
- {
- _aux_mem[cont] = mm_mem_req[cont];
- }
- }
- else
- {
- // Configure matrix multiply function
- _mm_gemm = std::make_unique<ClGemm>();
- _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
- auto mm_mem_req = _mm_gemm->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
- {
- _aux_mem[cont] = mm_mem_req[cont];
- }
- }
-}
-
-Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
-{
- const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
-
- const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
- false, // is_b_reshaped
- true, // reshape_b_only_on_first_run
- gemm_3d_depth, // depth_output_gemm3d
- skip_im2col, // reinterpret_input_as_3d
- false, // retain_internal_weights
- gemmlowp_output_stage, // gemmlowp_output_stage
- false, // fast_math
- false, // fp_mixed_precision
- true, // broadcast_bias
- act_info); // activation_info
-
- if(is_quantized)
- {
-        // Since computing the convolution requires negated offsets, we need to change the QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info = src->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->quantization_info();
-
- std::unique_ptr<ITensorInfo> src_qa = src->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
- // Perform validation step on GEMMLowp
- return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info);
- }
- else
- {
- // Perform validation step on Matrix multiply function
- return ClGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
- }
-}
-
-void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-
- ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst,
- conv2d_info,
- weights_info));
-
- const DataType data_type = src->data_type();
- const DataLayout data_layout = src->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->dimension(idx_width);
- const unsigned int kernel_height = weights->dimension(idx_height);
- const unsigned int num_kernels = weights->dimension(idx_kernels);
-
- const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
- _is_prepared = weights_info.retain_internal_weights();
- _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
- _skip_col2im = data_layout == DataLayout::NHWC;
-
-    // Only in the quantized case are there a few scenarios where we cannot fuse the activation function into GEMM
- _fuse_activation = true;
-
- const ITensorInfo *gemm_input_to_use = src;
- ITensorInfo *gemm_output_to_use = dst;
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv2d_info.conv_info,
- conv2d_info.dilation);
-
- unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
-
- ITensorInfo *biases_to_use = biases;
- _append_bias = false;
-
- _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>();
- if(conv2d_info.num_groups != 1 && biases != nullptr)
- {
-        // num_groups != 1 is only supported for NCHW
-        // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
- biases_to_use = nullptr;
- _append_bias = true;
- _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups);
- }
- else
- {
- _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups);
- }
-
- // Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
- {
- // Configure and tune im2col. im2col output shape is auto-initialized
- _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>();
-
- // Set the GPU target for im2col
- _im2col_kernel->set_target(CLScheduler::get().target());
- _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups);
-
- // Set quantization info
- _im2col_output.set_quantization_info(src->quantization_info());
- CLScheduler::get().tune_kernel_static(*_im2col_kernel);
-
- // Update GEMM input
- gemm_input_to_use = &_im2col_output;
- }
-
- // Create GEMM output tensor
- if(!_skip_col2im)
- {
- TensorShape shape_gemm;
-
- // If we cannot skip col2im it means we run im2col as well
- shape_gemm = _im2col_output.tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
- _gemm_output = TensorInfo(shape_gemm, 1, data_type);
- _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output;
- }
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage;
- gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage.gemmlowp_offset = 0;
-
- // Configure output stage for quantized case
- if(_is_quantized)
- {
- const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info;
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
- const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
-
- gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
-
- gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
- gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
- quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
- gemmlowp_output_stage.gemmlowp_multipliers.data(),
- gemmlowp_output_stage.gemmlowp_shifts.data());
- gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
- gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
-
- PixelValue min_val{};
- PixelValue max_val{};
- std::tie(min_val, max_val) = get_min_max(dst->data_type());
-
- auto min_activation = min_val.get<int32_t>();
- auto max_activation = max_val.get<int32_t>();
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(conv2d_info.act_info.enabled())
- {
- if(supported_acts.count(conv2d_info.act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
- }
- else
- {
- _fuse_activation = false;
- }
- }
-
- // Set the GEMMLowp output stage info
- gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
- gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
- gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
- }
-
- // Configure and tune GEMM
- // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
- const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
-
- configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
-
- if(!_skip_col2im)
- {
- // Set the GPU target for col2im
- _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
- _col2im_kernel->set_target(CLScheduler::get().target());
- // Configure and tune Col2Im
- _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups);
- CLScheduler::get().tune_kernel_static(*_col2im_kernel.get());
- }
-
- ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
- "Output shape does not match the expected one");
-
- if(!_fuse_activation)
- {
- _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
- _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
- }
-
- _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
- _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size());
- _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
-}
-
-Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
-
- if(!is_quantized_per_channel)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW));
-
- const DataLayout data_layout = src->data_layout();
- const DataType data_type = src->data_type();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->dimension(idx_width);
- const unsigned int kernel_height = weights->dimension(idx_height);
- const unsigned int num_kernels = weights->dimension(idx_kernels);
-
- TensorInfo im2col_reshaped_info{};
- TensorInfo info_gemm{};
- TensorInfo weights_reshaped_info{};
- const ITensorInfo *gemm_input_to_use = src;
- const ITensorInfo *gemm_output_to_use = dst;
- const ITensorInfo *weights_to_use = weights;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1
- && conv2d_info.conv_info.stride().second == 1);
- const bool skip_col2im = data_layout == DataLayout::NHWC;
- bool fuse_activation = true;
-
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- // Validate biases
- if(biases != nullptr)
- {
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if(conv2d_info.act_info.enabled())
- {
- ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a());
- }
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv2d_info.conv_info,
- conv2d_info.dilation);
-
- unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
-
- const ITensorInfo *biases_to_use = biases;
- bool append_bias = false;
-
- if(conv2d_info.num_groups != 1 && biases != nullptr)
- {
-        // num_groups != 1 is only supported for NCHW
-        // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
- biases_to_use = nullptr;
- append_bias = true;
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
- }
- else
- {
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
- }
-
- weights_to_use = &weights_reshaped_info;
-
- if(!skip_im2col)
- {
- const Size2D kernel_dims(kernel_width, kernel_height);
-
- // Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups);
-
- auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape));
-
- ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups));
- gemm_input_to_use = &im2col_reshaped_info;
- }
-
- // Create GEMM output tensor
- if(!skip_col2im)
- {
- TensorShape shape_gemm;
-
- shape_gemm = gemm_input_to_use->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
- info_gemm = TensorInfo(shape_gemm, 1, data_type);
- info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
- gemm_output_to_use = &info_gemm;
- }
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage;
- gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage.gemmlowp_offset = 0;
- gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
-
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
- const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info;
- const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
-
- gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
- gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
- quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
- gemmlowp_output_stage.gemmlowp_multipliers.data(),
- gemmlowp_output_stage.gemmlowp_shifts.data());
- gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
- gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
-
- int min_activation = 0;
- int max_activation = 0;
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(conv2d_info.act_info.enabled())
- {
- if(supported_acts.count(conv2d_info.act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
- }
- else
- {
- fuse_activation = false;
- }
- }
-
- // Set the GEMMLowp output stage info
- gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
- gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
- gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
- }
-
- // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
- const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
-
- // Validate Col2Im
- if(!skip_col2im)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups));
- }
-
-    // Validate Activation Layer
- if(!fuse_activation)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
- }
-
- return Status{};
-}
-
-void ClGemmConv2d::run(ITensorPack &tensors)
-{
- prepare(tensors);
-
- auto src = tensors.get_const_tensor(ACL_SRC_0);
- auto biases = tensors.get_const_tensor(ACL_SRC_2);
- auto dst = tensors.get_tensor(ACL_DST);
- auto gemm_input_to_use = src;
- auto gemm_output_to_use = dst;
-
- CLAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
- CLAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
- CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
-
- // Run im2col
- if(!_skip_im2col)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, im2col_output.get() }
- };
- CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false);
- gemm_input_to_use = im2col_output.get();
- }
- if(!_skip_col2im)
- {
- gemm_output_to_use = gemm_output.get();
- }
- ITensorPack pack_mm = tensors;
- pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
- pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
- if(!_append_bias)
- {
- pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases);
- }
- pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
- // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions
- if(_is_quantized)
- {
- // Run gemmlowp
- _mm_gemmlowp->run(pack_mm);
- }
- else
- {
- // Run gemm
- _mm_gemm->run(pack_mm);
- }
-
- // Reshape output matrix
- if(!_skip_col2im)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
- CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false);
- }
-
-    // Run Activation Layer if we cannot fuse it in GEMM
- if(!_fuse_activation)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, dst },
- { TensorType::ACL_DST, dst }
- };
- CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false);
- }
-}
-
-void ClGemmConv2d::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- // Run weights reshaping and mark original weights tensor as unused
- ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
- CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, weights_reshaped.get() }
- };
-
- if(_append_bias)
- {
- const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- pack.add_const_tensor(TensorType::ACL_BIAS, biases);
- }
- CLScheduler::get().enqueue_op(*_weights_reshape_kernel.get(), pack, true);
- tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
-
- // Prepare GEMM
- _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
- _is_prepared = true;
- }
-}
-experimental::MemoryRequirements ClGemmConv2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute
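For the stateless operator deleted above, tensors are supplied at execution time through an ITensorPack rather than being stored in the function. A caller-side sketch of that contract follows (assumed glue code, not part of the patch); it mirrors the ACL_SRC_0/1/2 and ACL_DST slots read by run(), and leaves out the binding of the auxiliary tensors reported by workspace(). The operator header path is only noted in a comment since this patch relocates it.

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h" // TensorType slot ids
// Plus the internal ClGemmConv2d operator header, relocated out of src/runtime/gpu/cl/operators/ by this patch.

using namespace arm_compute;

// Assumed caller-side helper: bind the tensors that ClGemmConv2d::run() reads.
void run_gemm_conv2d(opencl::ClGemmConv2d &conv, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst)
{
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);     // input
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights); // original weights; prepare() reshapes them once
    pack.add_const_tensor(TensorType::ACL_SRC_2, biases);  // may be nullptr, e.g. when the bias is appended to the weights
    pack.add_tensor(TensorType::ACL_DST, dst);
    // The auxiliary tensors reported by conv.workspace() must also be added to the pack
    // (keyed by their offset_int_vec slot) by the managing function; omitted in this sketch.
    conv.run(pack); // run() calls prepare() internally on the first invocation
}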
diff --git a/src/runtime/gpu/cl/operators/ClGemmConv2d.h b/src/runtime/gpu/cl/operators/ClGemmConv2d.h
deleted file mode 100644
index e16d029e71..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemmConv2d.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_CONV2D_H
-#define ARM_COMPUTE_CL_GEMM_CONV2D_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-class ClGemm;
-class ClGemmLowpMatrixMultiplyCore;
-namespace kernels
-{
-class ClIm2ColKernel;
-class ClCol2ImKernel;
-class ClWeightsReshapeKernel;
-class ClActivationKernel;
-} // namespace kernels
-
-/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
- *
- * -# @ref opencl::kernels::ClIm2ColKernel
- * -# @ref ClGemm (if the data type is FP32 or FP16)
- * -# @ref ClGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref ClGemmLowpOutputStage with QUANTIZE_DOWN_FIXEDPOINT type of quantization (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref opencl::kernels::ClCol2ImKernel (if NCHW data layout)
- * -# @ref opencl::kernels::ClActivationKernel
- */
-class ClGemmConv2d : public IClOperator
-{
-public:
- /** Constructor */
- ClGemmConv2d();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ClGemmConv2d(const ClGemmConv2d &) = delete;
- /** Default move constructor */
- ClGemmConv2d(ClGemmConv2d &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ClGemmConv2d &operator=(const ClGemmConv2d &) = delete;
- /** Default move assignment operator */
- ClGemmConv2d &operator=(ClGemmConv2d &&) = default;
-    /** Default destructor */
- ~ClGemmConv2d();
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:------------------|:--------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
- * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info = WeightsInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
-     * Similar to ClGemmConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info = WeightsInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- /** Configures the appropriate matrix multiply routine
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
- * QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
- * @param[in, out] dst Output tensor info. Data types supported: same as @p input.
- * @param[in] gemmlowp_output_stage GEMMLowp output stage info
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] act_info Activation to apply after the matrix multiplication
- */
- void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref ClGemmConv2d matrix multiply routines
- *
- * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
- * QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
- * @param[in] dst Output tensor info. Data types supported: same as @p input.
- * @param[in] gemmlowp_output_stage GEMMLowp output stage info
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout.
- * @param[in] act_info Activation to apply after the matrix multiplication
- *
- * @return a status
- */
- static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
-
- enum AuxTensorIdx
- {
- // ClGemmLowpMatrixMultiplyCore has up to 7 internal tensors
- Im2ColOutput = 8,
- WeightsReshaped,
- GemmOutput,
- Count
- };
-
- std::unique_ptr<kernels::ClWeightsReshapeKernel> _weights_reshape_kernel;
- std::unique_ptr<kernels::ClIm2ColKernel> _im2col_kernel;
- std::unique_ptr<ClGemm> _mm_gemm;
- std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
- std::unique_ptr<opencl::kernels::ClCol2ImKernel> _col2im_kernel;
- std::unique_ptr<kernels::ClActivationKernel> _activation_kernel;
-
- TensorInfo _im2col_output;
- TensorInfo _weights_reshaped;
- TensorInfo _gemm_output;
-
- bool _skip_im2col;
- bool _skip_col2im;
- bool _is_quantized;
- bool _fuse_activation;
- bool _append_bias;
- bool _is_prepared;
-
- experimental::MemoryRequirements _aux_mem;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_CONV2D_H */
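The interface documented in the header above takes ITensorInfo descriptors plus a Conv2dInfo descriptor, so a configuration can be checked without allocating tensors. Below is a rough validation sketch for the 1x1 NHWC case that the implementation fast-paths (skipping im2col/col2im). The default-constructed Conv2dInfo and the relocated header path are assumptions; the channels-first NHWC shape layout follows the data-layout index helpers used in the deleted sources.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/FunctionDescriptors.h" // Conv2dInfo
// Plus the internal ClGemmConv2d operator header, relocated out of src/runtime/gpu/cl/operators/ by this patch.

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // the validation path queries the GPU target

    // 1x1 F32 convolution in NHWC; TensorShape stores NHWC as [C, W, H, N].
    TensorInfo src(TensorShape(16U, 56U, 56U, 1U), 1, DataType::F32);
    TensorInfo weights(TensorShape(16U, 1U, 1U, 32U), 1, DataType::F32); // [IFM, kernel_x, kernel_y, OFM]
    TensorInfo dst(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    Conv2dInfo conv2d_info{}; // assumed defaults: stride 1, no padding, no fused activation, num_groups = 1
    const Status st = opencl::ClGemmConv2d::validate(&src, &weights, nullptr, &dst, conv2d_info);
    return bool(st) ? 0 : 1;
}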
diff --git a/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
deleted file mode 100644
index 0c72912642..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
+++ /dev/null
@@ -1,786 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Log.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/gpu/cl/kernels/ClCastKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "utils/TypePrinter.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::cl_gemm;
-using namespace arm_compute::opencl::kernels;
-using namespace arm_compute::experimental;
-
-namespace
-{
-inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
-{
- switch(kernel_type)
- {
- case CLGEMMKernelType::NATIVE:
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- {
- return true;
- }
- default:
- {
- return false;
- }
- }
-}
-
-// Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
-inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
-{
- auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
- if(bool(gemm_kernel))
- {
- if(validate_gemm_kernel(gemm_kernel.gemm_type))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
- return gemm_kernel.gemm_type;
- }
- }
- gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
- return gemm_kernel.gemm_type;
-}
-
-// Validate lhs_info and rhs_info for native kernel
-inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
-{
-    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the native kernel
- TensorInfo mm_result_s32_info{};
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
- // Validate mm kernel
-    // NOTE: Ignore all other parameters (e.g. output stage etc.) and only validate lhs and rhs info
-    // NOTE: This assumes:
-    // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
-    // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
- if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
- {
- return false;
- }
- return true;
-}
-
-// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
-{
- auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
- if(config)
- {
- if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
- }
- }
- config = auto_heuristics::select_default_gemm_config_native(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
-}
-
-// Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
- unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
-{
- // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
- TensorInfo tmp_b_info{};
- // Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
- {
- return false;
- }
- // Validate mm kernel
-    // NOTE: Ignore all other parameters (e.g. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
-    // NOTE: This assumes:
-    // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp validate_arguments).
-    // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp validate_and_configure_window).
- GEMMKernelInfo gemm_kernel_info;
- gemm_kernel_info.m = m;
- gemm_kernel_info.n = n;
- gemm_kernel_info.k = k;
- gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- // Since we ignore the output stage, output data type has to be S32 to pass the validation
- TensorInfo output_info_copy(*output);
- output_info_copy.set_data_type(DataType::S32);
- if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
- {
- return false;
- }
- return true;
-}
-
-// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
- const ITensorInfo *a,
- const ITensorInfo *b, const ITensorInfo *output)
-{
- auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
- if(config)
- {
- if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d))
- {
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
- }
- }
- config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
-}
-
-inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
-{
- switch(kernel_type)
- {
- case CLGEMMKernelType::NATIVE:
- return false;
- case CLGEMMKernelType::RESHAPED_ONLY_RHS:
- return true;
- default:
- ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
- }
-}
-} // namespace
-
-ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
- : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
- _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
- _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
- _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
- _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
- _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
- _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),
- _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),
- _aux_mem(AuxTensorIdx::Count)
-{
-}
-
-ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;
-
-void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
- ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output,
- const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c != nullptr ? c : nullptr, output, gemm_info));
-
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _a_offset = a->quantization_info().uniform().offset;
- _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
- && a->data_type() == DataType::QASYMM8;
- _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
- _gemm_info = gemm_info;
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Set the target for the kernels
- _mm_native_kernel->set_target(gpu_target);
- _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
-
- GEMMRHSMatrixInfo rhs_info;
- GEMMLHSMatrixInfo lhs_info;
-
- // Arguments used by GEMMReshapeInfo
-    // If we pass matrix A and matrix B reshaped to ClGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
- // Check if we need to reshape the matrix A and matrix B
- _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run));
-
- if(_convert_to_qasymm8)
- {
- // Set data type for converted weights
- _qasymm8_weights = *b;
- _qasymm8_weights.set_data_type(DataType::QASYMM8);
- _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP);
- }
-
- ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
- if(_is_gemm_reshaped)
- {
- matrix_b = &_tmp_b;
-
- // Pick up the GEMM configuration
-        // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
- depth_output_gemm3d,
- a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
-
- // Configure reshape RHS kernel
- _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
- }
-
- // Using default reduction info
- const GEMMLowpReductionKernelInfo reduction_info {};
-
- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
- {
- _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
- }
-
- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
- }
-
- GEMMKernelInfo gemm_kernel_info;
- gemm_kernel_info.m = m;
- gemm_kernel_info.n = n;
- gemm_kernel_info.k = k;
- gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- gemm_kernel_info.a_offset = _a_offset;
- gemm_kernel_info.b_offset = _b_offset;
- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- // Configure offset contribution kernel
- const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
-
- _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
- _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
- gemmlowp_output_stage.output_data_type = a->data_type();
- if(num_filters == 1)
- {
- // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
- // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
- gemmlowp_output_stage.is_quantized_per_channel = false;
- }
-
- gemm_kernel_info.output_stage = gemmlowp_output_stage;
-
- if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- // Configure and tune matrix multiply kernel with fused output stage
- _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
- }
- else
- {
- _run_output_stage = true;
-
- if(_is_gemm_reshaped)
- {
- _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
- }
- else
- {
- // Pick up the GEMM configuration
-                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
- a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
-
- // Configure matrix multiply kernel
- _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);
-
- _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
- c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
- &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
- }
- }
- }
- else
- {
- _run_offset_contribution = true;
- if(_is_gemm_reshaped)
- {
- // Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
- }
- else
- {
- // Pick up the GEMM configuration
-            // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
- a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
-
- // Configure matrix multiply kernel
- _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
- }
-
- // Configure offset contribution kernel
- _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
- c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset);
- }
-
- // Request memory
- _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
- if(_is_gemm_reshaped)
- {
-        // Overwrite the Rhs lifetime as Prepare if the gemm is reshaped, since there will be a two-step transformation
- _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
- }
- if(_a_offset != 0)
- {
- _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size());
- }
- if(_b_offset != 0)
- {
- _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
- }
- _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
- _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size());
- _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
-}
-
-Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- int32_t a_offset = a->quantization_info().uniform().offset;
- int32_t b_offset = b->quantization_info().uniform().offset;
-
- const ITensorInfo *matrix_a_info = a;
-
- TensorInfo tmp_b_info{};
- GEMMRHSMatrixInfo rhs_info;
- GEMMLHSMatrixInfo lhs_info;
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
- bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
- && is_data_type_quantized_asymmetric(a->data_type());
- TensorInfo weights_info(*b);
- if(convert_to_qasymm8)
- {
- b_offset = -128;
- weights_info.set_data_type(DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
- }
- const ITensorInfo *matrix_b_info = &weights_info;
- if(reshape_matrix_b)
- {
- matrix_b_info = &tmp_b_info;
-
- // Pick up the GEMM configuration
- // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
-        // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
-
- // Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
- }
-
- TensorInfo info_vector_sum_col{};
- TensorInfo info_vector_sum_row{};
-
- const GEMMLowpReductionKernelInfo reduction_info;
- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
- {
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
- }
-
- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
- {
- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
- }
-
- GEMMKernelInfo gemm_kernel_info;
- gemm_kernel_info.m = m;
- gemm_kernel_info.n = n;
- gemm_kernel_info.k = k;
- gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
- gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
- gemm_kernel_info.lhs_info = lhs_info;
- gemm_kernel_info.rhs_info = rhs_info;
- gemm_kernel_info.a_offset = a_offset;
- gemm_kernel_info.b_offset = b_offset;
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
-
- const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
-
- GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
- gemmlowp_output_stage.output_data_type = a->data_type();
-
- gemm_kernel_info.output_stage = gemmlowp_output_stage;
- if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- &gemm_output_stage_multipliers_shifts_info,
- &gemm_output_stage_multipliers_shifts_info));
- }
- else
- {
- TensorInfo mm_result_s32_info{};
-
- if(reshape_matrix_b)
- {
-                // Output tensor auto-initialization if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
- }
- else
- {
-                // Output tensor auto-initialization if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
-
- // Pick up the GEMM configuration
- // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
-                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
- }
-
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- output,
- a_offset, b_offset,
- gemmlowp_output_stage,
- &gemm_output_stage_multipliers_shifts_info,
- &gemm_output_stage_multipliers_shifts_info));
- }
- }
- else
- {
- if(reshape_matrix_b)
- {
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
- }
- else
- {
- // Pick up the GEMM configuration
-            // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
- }
-
- if(output->total_size() != 0)
- {
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- a_offset, b_offset));
- }
- }
-
- return Status{};
-}
-
-void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
-{
- const ITensor *a = tensors.get_const_tensor(ACL_SRC_0);
- const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
- const ITensor *c = tensors.get_const_tensor(ACL_SRC_2);
- ITensor *dst = tensors.get_tensor(ACL_DST);
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, dst);
-
- CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
- CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true);
- CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true);
- CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
- CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true);
- CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true);
- CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true);
-
- // Prepare the consts if needed
- prepare(tensors);
-
- const ITensor *matrix_a = a;
- const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
-
- if(_is_gemm_reshaped)
- {
- matrix_b = tmp_b.get();
- if(!_reshape_b_only_on_first_run)
- {
- // Run reshape matrix B
- ITensorPack mtx_b_reshape_pack =
- {
- { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
- CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
- }
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- ITensorPack mtx_b_red_pack =
- {
- { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
- { TensorType::ACL_DST, vec_sum_col.get() }
- };
- CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
- }
-
- // Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- ITensorPack mtx_a_red_pack =
- {
- { TensorType::ACL_SRC, matrix_a },
- { TensorType::ACL_DST, vec_sum_row.get() }
- };
- CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
- }
-
- // Run matrix multiply
- if(_is_gemm_reshaped)
- {
- ITensorPack gemm_reshaped_pack;
- if(_run_offset_contribution)
- {
- gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b },
- { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst }
- });
- }
- else
- {
- gemm_reshaped_pack = ITensorPack(
- {
- { TensorType::ACL_SRC, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b },
- { TensorType::ACL_BIAS, c },
- { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
- { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
- { TensorType::ACL_SHIFTS, shifts.get() },
- { TensorType::ACL_MULTIPLIERS, multipliers.get() },
- { TensorType::ACL_DST, dst },
- });
- }
- CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
- }
- else
- {
- ITensorPack gemm_native_pack =
- {
- { TensorType::ACL_SRC_0, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b },
- { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() }
- };
- CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
- }
- if(_run_output_stage)
- {
- // Run offset contribution/output stage kernel
- ITensorPack output_stage_pack =
- {
- { TensorType::ACL_SRC, res32.get() },
- { TensorType::ACL_BIAS, c },
- { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
- { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
- { TensorType::ACL_SHIFTS, shifts.get() },
- { TensorType::ACL_MULTIPLIERS, multipliers.get() },
- { TensorType::ACL_DST, dst },
- };
- CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
- }
- if(_run_offset_contribution)
- {
- // Run offset contribution kernel
- ITensorPack offset_contrib_pack =
- {
- { TensorType::ACL_SRC_DST, dst },
- { TensorType::ACL_BIAS, c },
- { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
- { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }
- };
- CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
- }
-}
-
-void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
- CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
- CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false);
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(b);
-
- if(_convert_to_qasymm8)
- {
- ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } };
- CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
- b->mark_as_unused();
- }
-
- if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
- {
- // Run reshape kernel and mark original weights tensor as unused
- ITensorPack mtx_b_pack =
- {
- { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
- CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
- b->mark_as_unused();
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && _reshape_b_only_on_first_run)
- {
- ITensorPack mtx_b_red_pack =
- {
- { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
- { TensorType::ACL_DST, vec_sum_col.get() }
- };
- CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
- }
-
- // Compute GEMM output multipliers and shifts for output stage
- {
- const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
-
- CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
- CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);
-
- ICLTensor *multiplier_tensor = multipliers.get();
- if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
- {
- multiplier_tensor->map(CLScheduler::get().queue(), true);
- std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
- multiplier_tensor->unmap(CLScheduler::get().queue());
- }
-
- ICLTensor *shifts_tensor = shifts.get();
- if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0)
- {
- shifts_tensor->map(CLScheduler::get().queue(), true);
- std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
- shifts_tensor->unmap(CLScheduler::get().queue());
- }
- }
- CLScheduler::get().queue().finish();
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
deleted file mode 100644
index 36a4257b86..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H
-#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/CL/CLTypes.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-// Forward declarations
-class ClCastKernel;
-class ClGemmLowpMatrixMultiplyNativeKernel;
-class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel;
-class ClGemmReshapeRhsMatrixKernel;
-class ClGemmLowpMatrixAReductionKernel;
-class ClGemmLowpMatrixBReductionKernel;
-class ClGemmLowpOffsetContributionKernel;
-class ClGemmLowpOffsetContributionOutputStageKernel;
-} // namespace kernels
-
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
-class ClGemmLowpMatrixMultiplyCore : public IClOperator
-{
-public:
- ClGemmLowpMatrixMultiplyCore();
- ~ClGemmLowpMatrixMultiplyCore();
- /** Initialise the kernel's inputs, output
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:------------------|:--------|:--------------|
- * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
- * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
- * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
- * |QASYMM8 |QASYMM8 |S32 |S32 |
- * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
- * |QASYMM8 |QSYMM8 |S32 |S32 |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
- * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
- * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
- * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
- *
- * @note GEMMLowp: low precision GEMM kernel. [A * B + C]
- * This kernel performs the following computations:
- *
- * -# Convert matrix A values from 8-bit quantized to int32 and add a_offset to each of them.
- * -# Convert matrix B values from 8-bit quantized to int32 and add b_offset to each of them.
- * -# Compute the matrix product of the resulting a * b in int32.
- * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[out] output Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should be executed only for the first run
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClGemmLowpMatrixMultiplyCore::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum AuxTensorIdx
- {
- ResultS32 = 0,
- RhsQAsymm8,
- RhsReshape,
- VecSumCol,
- VecSumRow,
- Multipliers,
- Shifts,
- Count
- };
-
-private:
- // Kernels used
- std::unique_ptr<kernels::ClCastKernel> _weights_to_qasymm8;
- std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel;
- std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
- std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel;
- std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
- std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
- std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
- std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
-
- // Temporary tensors
- TensorInfo _qasymm8_weights{};
- TensorInfo _vector_sum_col{};
- TensorInfo _vector_sum_row{};
- TensorInfo _tmp_b{};
- TensorInfo _mm_result_s32{};
- TensorInfo _gemm_output_stage_multipliers{};
- TensorInfo _gemm_output_stage_shifts{};
-
- int32_t _a_offset{ 0 };
- int32_t _b_offset{ 0 };
- bool _is_gemm_reshaped{ true };
- bool _reshape_b_only_on_first_run{ false };
- bool _run_output_stage{ false };
- bool _convert_to_qasymm8{ false };
- bool _run_offset_contribution{ false };
- bool _is_prepared{ false };
- GEMMInfo _gemm_info{};
-
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
\ No newline at end of file
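
For reference, a minimal shape/type-level sketch of driving the operator removed above (illustrative only: it assumes an initialized CL runtime, uses the pre-move include path shown in this diff, and picks arbitrary example shapes and quantization values; actually running it would additionally require the auxiliary tensors reported by workspace() to be added to the pack):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // A: 4x16 (M x K), B: 16x8 (K x N), dst: 4x8 (M x N); ACL shapes are (dim0, dim1) = (cols, rows)
        TensorInfo a(TensorShape(16U, 4U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
        TensorInfo b(TensorShape(8U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3));
        TensorInfo dst(TensorShape(8U, 4U), 1, DataType::S32); // no output stage: raw S32 accumulators

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClGemmLowpMatrixMultiplyCore::validate(&a, &b, nullptr, &dst, GEMMInfo()));

        opencl::ClGemmLowpMatrixMultiplyCore gemmlowp;
        gemmlowp.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, nullptr, &dst, GEMMInfo());

        // run()/prepare() then take an ITensorPack with ACL_SRC_0/1/2, ACL_DST and the workspace() tensors.
        return 0;
    }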
diff --git a/src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.cpp
deleted file mode 100644
index 3477583c76..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
-#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- {
- auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel>();
- k->configure(compile_context, src, bias, dst, &info);
- _kernel = std::move(k);
- break;
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- {
- auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel>();
- k->configure(compile_context, src, bias, dst, &info);
- _kernel = std::move(k);
- break;
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
- {
- auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel>();
- k->configure(compile_context, src, bias, dst, &info);
- _kernel = std::move(k);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
- }
-}
-
-Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
-
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info);
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info);
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
- return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(src, bias, dst, &info);
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
- }
-}
-
-void ClGemmLowpOutputStage::run(ITensorPack &tensors)
-{
- const ITensor *src = tensors.get_const_tensor(ACL_SRC);
- const ITensor *bias = tensors.get_const_tensor(ACL_BIAS);
- ITensor *dst = tensors.get_tensor(ACL_DST);
-
- ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } };
- CLScheduler::get().enqueue_op(*_kernel, pack, true);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.h
deleted file mode 100644
index 33b82fcafa..0000000000
--- a/src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H
-#define ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-/** This file contains all available output stages for GEMMLowp on OpenCL.
- *
- * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
- * and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- *
- * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
- */
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to execute GEMMLowpQuantizeDown kernels on CL.
- *
- * This function calls the following CL kernels:
- *
- * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel
- * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel
- * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
-*/
-class ClGemmLowpOutputStage : public IClOperator
-{
-public:
- /** Constructor */
- ClGemmLowpOutputStage() = default;
- /** Initialise the kernel's inputs, output
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |src0 |src1 |dst |
- * |:--------------|:-------------|:-------------|
- * |S32 |S32 |QASYMM8 |
- * |S32 |S32 |QASYMM8_SIGNED|
- * |S32 |S32 |QSYMM16 |
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. Data type supported: S32
- * @param[in]  bias             Biases tensor. Only shared biases are supported; it can be nullptr if the bias addition is not required.
- *                              Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
- * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info GEMMLowp output stage metadata.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClGemmLowpOutputStage::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H */
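
A rough configuration sketch for the output stage declared above (an assumption-laden example: the multiplier, shift, offset and bounds are placeholder values for a QASYMM8 destination, and an initialized CL runtime plus the pre-move include path are assumed):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/runtime/gpu/cl/operators/ClGemmLowpOutputStage.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        TensorInfo src(TensorShape(8U, 4U), 1, DataType::S32);
        TensorInfo dst(TensorShape(8U, 4U), 1, DataType::QASYMM8, QuantizationInfo(1.f, 0));

        GEMMLowpOutputStageInfo info{};
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_multiplier = 1073741824; // example fixed-point multiplier (0.5 in Q0.31)
        info.gemmlowp_shift      = 1;          // example right shift
        info.gemmlowp_offset     = 10;         // example output zero point
        info.gemmlowp_min_bound  = 0;
        info.gemmlowp_max_bound  = 255;
        info.output_data_type    = DataType::QASYMM8;

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClGemmLowpOutputStage::validate(&src, nullptr, &dst, info));

        opencl::ClGemmLowpOutputStage output_stage;
        output_stage.configure(CLKernelLibrary::get().get_compile_context(), &src, nullptr, &dst, info);
        // run() then takes an ITensorPack with ACL_SRC, an optional ACL_BIAS and ACL_DST CL tensors.
        return 0;
    }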
diff --git a/src/runtime/gpu/cl/operators/ClLogicalNot.cpp b/src/runtime/gpu/cl/operators/ClLogicalNot.cpp
deleted file mode 100644
index 400efe450d..0000000000
--- a/src/runtime/gpu/cl/operators/ClLogicalNot.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClLogicalNot.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
- k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT);
- _kernel = std::move(k);
-}
-
-Status ClLogicalNot::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOGICAL_NOT);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClLogicalNot.h b/src/runtime/gpu/cl/operators/ClLogicalNot.h
deleted file mode 100644
index 782ac0848f..0000000000
--- a/src/runtime/gpu/cl/operators/ClLogicalNot.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_LOGICAL_NOT_H
-#define ARM_COMPUTE_CL_LOGICAL_NOT_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClElementWiseUnaryKernel for NOT operation */
-class ClLogicalNot : public IClOperator
-{
-public:
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: U8.
- * @param[out] dst Destination tensor info. Data types supported: same as @p src.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClLogicalNot::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_LOGICAL_NOT_H */
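
For illustration, a small end-to-end sketch of the operator above (assuming an initialized CL runtime and the pre-move include path; U8 is the only data type the doc comment lists for this operation, with boolean values stored as bytes):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/runtime/gpu/cl/operators/ClLogicalNot.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        TensorInfo src_info(TensorShape(16U, 8U), 1, DataType::U8);
        TensorInfo dst_info(TensorShape(16U, 8U), 1, DataType::U8);

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClLogicalNot::validate(&src_info, &dst_info));

        opencl::ClLogicalNot logical_not;
        logical_not.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);

        CLTensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack = { { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
        logical_not.run(pack);
        CLScheduler::get().sync();
        return 0;
    }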
diff --git a/src/runtime/gpu/cl/operators/ClMul.cpp b/src/runtime/gpu/cl/operators/ClMul.cpp
deleted file mode 100644
index d1e2bc806f..0000000000
--- a/src/runtime/gpu/cl/operators/ClMul.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClMul.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClMulKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClMulKernel>();
- k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
- _kernel = std::move(k);
-}
-
-Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
-{
- return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
-}
-
-void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClComplexMulKernel>();
- k->configure(compile_context, src1, src2, dst, act_info);
- _kernel = std::move(k);
-}
-
-Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClMul.h b/src/runtime/gpu/cl/operators/ClMul.h
deleted file mode 100644
index 29d5885a1c..0000000000
--- a/src/runtime/gpu/cl/operators/ClMul.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_MUL_H
-#define ARM_COMPUTE_CL_MUL_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref opencl::kernels::ClMulKernel */
-class ClMul : public IClOperator
-{
-public:
-    /** Initialise the kernel's sources, dst and conversion policy.
- *
- * Valid configurations (src1,src2) -> Output :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,U8) -> S16
- * - (S16,S16) -> S16
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClMul::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */
-class ClComplexMul : public IClOperator
-{
-public:
- /** Initialise the kernel's sources, dst.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClComplexMul::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_MUL_H */
\ No newline at end of file
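
A hedged usage sketch of ClMul as declared above (example F32 shapes; scale 1.0 satisfies the 1/2^n constraint from the doc comment; an initialized CL runtime and the pre-move include path are assumed):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/runtime/gpu/cl/operators/ClMul.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        TensorInfo lhs_info(TensorShape(32U, 16U), 1, DataType::F32);
        TensorInfo rhs_info(TensorShape(32U, 16U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(32U, 16U), 1, DataType::F32);

        const float scale = 1.f; // 1/2^0

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClMul::validate(&lhs_info, &rhs_info, &dst_info, scale,
                                                           ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));

        opencl::ClMul mul;
        mul.configure(CLKernelLibrary::get().get_compile_context(), &lhs_info, &rhs_info, &dst_info, scale,
                      ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        CLTensor lhs, rhs, dst;
        lhs.allocator()->init(lhs_info);
        rhs.allocator()->init(rhs_info);
        dst.allocator()->init(dst_info);
        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack = { { TensorType::ACL_SRC_0, &lhs }, { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_DST, &dst } };
        mul.run(pack);
        CLScheduler::get().sync();
        return 0;
    }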
diff --git a/src/runtime/gpu/cl/operators/ClPRelu.cpp b/src/runtime/gpu/cl/operators/ClPRelu.cpp
deleted file mode 100644
index d1ce14cc87..0000000000
--- a/src/runtime/gpu/cl/operators/ClPRelu.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClPRelu.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-using KernelType = kernels::ClArithmeticKernel;
-void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
-{
- auto k = std::make_unique<KernelType>();
- k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
- _kernel = std::move(k);
-}
-
-Status ClPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
-{
- return KernelType::validate(ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
-}
-
-void ClPRelu::run(ITensorPack &tensors)
-{
- // Output tensor can be given as nullptr for in-place computation.
- // In this case, get the input tensor and use it as the output tensor.
- if(tensors.get_tensor(TensorType::ACL_DST) == nullptr)
- {
- auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation");
- tensors.add_tensor(TensorType::ACL_DST, src_tensor);
- }
- IClOperator::run(tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPRelu.h b/src/runtime/gpu/cl/operators/ClPRelu.h
deleted file mode 100644
index 3a02030635..0000000000
--- a/src/runtime/gpu/cl/operators/ClPRelu.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_PRELU_H
-#define ARM_COMPUTE_CL_PRELU_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic operator to run @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU
- *
- * @note The operator implements an activation layer with the PRELU activation function.
- */
-class ClPRelu : public IClOperator
-{
-public:
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  alpha           PRelu layer parameters. Data types supported: same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClPRelu::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_PRELU_H */
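
A usage sketch of ClPRelu (illustrative assumptions: an element-wise alpha tensor of the same shape as the input, an initialized CL runtime, and the pre-move include path). Per the run() override above, omitting ACL_DST from the pack would instead compute the result in place on the source tensor:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/runtime/gpu/cl/operators/ClPRelu.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        TensorInfo input_info(TensorShape(8U, 8U, 16U), 1, DataType::F32);
        TensorInfo alpha_info(TensorShape(8U, 8U, 16U), 1, DataType::F32); // one slope per element in this example
        TensorInfo output_info(TensorShape(8U, 8U, 16U), 1, DataType::F32);

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClPRelu::validate(&input_info, &alpha_info, &output_info));

        opencl::ClPRelu prelu;
        prelu.configure(CLKernelLibrary::get().get_compile_context(), &input_info, &alpha_info, &output_info);

        CLTensor input, alpha, output;
        input.allocator()->init(input_info);
        alpha.allocator()->init(alpha_info);
        output.allocator()->init(output_info);
        input.allocator()->allocate();
        alpha.allocator()->allocate();
        output.allocator()->allocate();

        ITensorPack pack = { { TensorType::ACL_SRC_0, &input }, { TensorType::ACL_SRC_1, &alpha }, { TensorType::ACL_DST, &output } };
        prelu.run(pack);
        CLScheduler::get().sync();
        return 0;
    }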
diff --git a/src/runtime/gpu/cl/operators/ClPermute.cpp b/src/runtime/gpu/cl/operators/ClPermute.cpp
deleted file mode 100644
index 719bb6dac6..0000000000
--- a/src/runtime/gpu/cl/operators/ClPermute.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClPermute.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClPermuteKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
-{
- auto k = std::make_unique<kernels::ClPermuteKernel>();
- k->configure(compile_context, src, dst, perm);
- _kernel = std::move(k);
-}
-
-Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- return kernels::ClPermuteKernel::validate(src, dst, perm);
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClPermute.h b/src/runtime/gpu/cl/operators/ClPermute.h
deleted file mode 100644
index 867aba010d..0000000000
--- a/src/runtime/gpu/cl/operators/ClPermute.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_PERMUTE_H
-#define ARM_COMPUTE_CL_PERMUTE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClPermuteKernel */
-class ClPermute : public IClOperator
-{
-public:
-    /** Initialise the kernel's inputs, outputs and permutation vector
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[in] dst The dst tensor info. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClPermute::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_PERMUTE_H */
\ No newline at end of file
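
A short sketch of ClPermute as declared above, swapping the first two dimensions of an example tensor (an initialized CL runtime and the pre-move include path are assumed; the shapes and permutation are arbitrary):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/runtime/gpu/cl/operators/ClPermute.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Swap dimensions 0 and 1 of an (8, 4, 2) tensor -> (4, 8, 2)
        const PermutationVector perm(1U, 0U, 2U);
        TensorInfo src_info(TensorShape(8U, 4U, 2U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(4U, 8U, 2U), 1, DataType::F32);

        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClPermute::validate(&src_info, &dst_info, perm));

        opencl::ClPermute permute;
        permute.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, perm);

        CLTensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack = { { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
        permute.run(pack);
        CLScheduler::get().sync();
        return 0;
    }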
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.cpp b/src/runtime/gpu/cl/operators/ClPool2d.cpp
deleted file mode 100644
index 40c2b0a8ba..0000000000
--- a/src/runtime/gpu/cl/operators/ClPool2d.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClPool2d.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClPool2dKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- // Configure pooling kernel
- auto k = std::make_unique<kernels::ClPool2dKernel>();
- k->set_target(CLScheduler::get().target());
- k->configure(compile_context, src, dst, info, indices);
- _pooling = std::move(k);
-
- const DataType data_type = src->data_type();
-
- // Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode{};
- PixelValue pixel_value(0.f);
- if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding)
- {
- pixel_value = PixelValue(0, data_type, src->quantization_info());
- }
-
- // Data layout
- const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
- switch(data_layout)
- {
- case DataLayout::NCHW:
- border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- break;
- case DataLayout::NHWC:
- border_mode = BorderMode::CONSTANT;
- if(PoolingType::MAX == info.pool_type)
- {
- if(is_data_type_quantized(data_type))
- {
- std::tie(pixel_value, std::ignore) = get_min_max(data_type);
- }
- else
- {
- pixel_value = PixelValue(std::numeric_limits<float>::lowest());
- }
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
- _border_handler = std::move(b);
-
- // Tune kernels
- CLScheduler::get().tune_kernel_static(*_pooling);
-}
-
-Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
-{
- return kernels::ClPool2dKernel::validate(src, dst, info, indices);
-}
-
-void ClPool2d::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
- CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
- CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClPool2d.h b/src/runtime/gpu/cl/operators/ClPool2d.h
deleted file mode 100644
index 8ac386a64b..0000000000
--- a/src/runtime/gpu/cl/operators/ClPool2d.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_POOL2D_H
-#define ARM_COMPUTE_CL_POOL2D_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref kernels::ClPool2dKernel
- */
-class ClPool2d : public IClOperator
-{
-public:
- /** Constructor */
- ClPool2d() = default;
- /** Configure operator for a given list of arguments
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] info Pooling layer parameters.
- * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClPool2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-
-private:
- std::unique_ptr<ICLKernel> _pooling{ nullptr };
- std::unique_ptr<ICLKernel> _border_handler{ nullptr };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_POOL2D_H */
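For reference, the removed operator keeps the ITensorInfo-based configure / ITensorPack-based run split used throughout the opencl:: layer. Below is a minimal calling sketch; it is illustrative only (the header is internal, the helper name, tensor names, shapes and NHWC layout are assumptions, and the CL backend is assumed to be initialised beforehand, e.g. via CLScheduler::get().default_init()):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/runtime/gpu/cl/operators/ClPool2d.h"

    using namespace arm_compute;

    // 2x2 MAX pooling with stride 2 on pre-allocated NHWC tensors (hypothetical helper).
    void pool2d_example(ICLTensor *src, ICLTensor *dst)
    {
        const PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));

        opencl::ClPool2d op;
        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClPool2d::validate(src->info(), dst->info(), pool_info));
        op.configure(CLKernelLibrary::get().get_compile_context(), src->info(), dst->info(), pool_info);

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, src);
        pack.add_tensor(TensorType::ACL_DST, dst);
        op.run(pack); // enqueues the border handler, then the pooling kernel
    }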
diff --git a/src/runtime/gpu/cl/operators/ClQuantize.cpp b/src/runtime/gpu/cl/operators/ClQuantize.cpp
deleted file mode 100644
index 92bbb62ba5..0000000000
--- a/src/runtime/gpu/cl/operators/ClQuantize.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClQuantize.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClQuantizeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClQuantizeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClQuantizeKernel::validate(src, dst);
-}
-
-void ClQuantize::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClQuantize.h b/src/runtime/gpu/cl/operators/ClQuantize.h
deleted file mode 100644
index b15d389cca..0000000000
--- a/src/runtime/gpu/cl/operators/ClQuantize.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_QUANTIZE_H
-#define ARM_COMPUTE_CL_QUANTIZE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClQuantizeKernel that quantizes an input tensor */
-class ClQuantize : public IClOperator
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in]  src              Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this function
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClQuantize::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_QUANTIZE_H */
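Since output auto-initialisation is not supported, the caller has to fully describe the destination (shape, quantized data type and quantization parameters) before configuring. A hedged sketch of that flow; the scale/offset and shapes are made up, and the includes and CL initialisation from the ClPool2d sketch above are assumed:

    TensorInfo src_info(TensorShape(224U, 224U, 3U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(224U, 224U, 3U), 1, DataType::QASYMM8);
    dst_info.set_quantization_info(QuantizationInfo(0.02f, 128)); // illustrative scale/offset

    opencl::ClQuantize quantize;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClQuantize::validate(&src_info, &dst_info));
    quantize.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);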
diff --git a/src/runtime/gpu/cl/operators/ClReshape.cpp b/src/runtime/gpu/cl/operators/ClReshape.cpp
deleted file mode 100644
index d3fa9f10ab..0000000000
--- a/src/runtime/gpu/cl/operators/ClReshape.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClReshape.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClReshapeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClReshapeKernel::validate(src, dst);
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClReshape.h b/src/runtime/gpu/cl/operators/ClReshape.h
deleted file mode 100644
index b3d9267be4..0000000000
--- a/src/runtime/gpu/cl/operators/ClReshape.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_RESHAPE_H
-#define ARM_COMPUTE_CL_RESHAPE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClReshapeKernel */
-class ClReshape : public IClOperator
-{
-public:
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor info. Data type supported: All
- * @param[out] output Output info. Data type supported: Same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClReshape::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_RESHAPE_H */
\ No newline at end of file
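ClReshape is one of the stateless single-kernel wrappers: it only defines configure()/validate() and inherits run() from IClOperator, which enqueues the one kernel it owns. A short hedged sketch (shapes are illustrative; the destination only has to preserve the total element count):

    // Collapse a [7, 3, 2] tensor into a flat [42] tensor.
    TensorInfo src_info(TensorShape(7U, 3U, 2U), 1, DataType::F16);
    TensorInfo dst_info(TensorShape(42U), 1, DataType::F16);

    opencl::ClReshape reshape;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClReshape::validate(&src_info, &dst_info));
    reshape.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);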
diff --git a/src/runtime/gpu/cl/operators/ClScale.cpp b/src/runtime/gpu/cl/operators/ClScale.cpp
deleted file mode 100644
index 5c8d754c7e..0000000000
--- a/src/runtime/gpu/cl/operators/ClScale.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClScale.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClScaleKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- // Configure Scale kernel
- auto k = std::make_unique<kernels::ClScaleKernel>();
- k->set_target(CLScheduler::get().target());
- k->configure(compile_context, src, dst, info);
- _kernel = std::move(k);
-
- // Tune kernel
- CLScheduler::get().tune_kernel_static(*_kernel);
-}
-
-Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- return kernels::ClScaleKernel::validate(src, dst, info);
-}
-
-void ClScale::run(ITensorPack &tensors)
-{
- ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClScale.h b/src/runtime/gpu/cl/operators/ClScale.h
deleted file mode 100644
index 0ff78640f7..0000000000
--- a/src/runtime/gpu/cl/operators/ClScale.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_SCALE_H
-#define ARM_COMPUTE_CL_SCALE_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels:
- *
- * -# @ref kernels::ClScaleKernel
- */
-class ClScale : public IClOperator
-{
-public:
- /** Constructor */
- ClScale() = default;
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] dst Destination tensor info. Data types supported: Same as @p src
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
- */
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClScale::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
-
- // Inherited method overridden
- void run(ITensorPack &tensors) override;
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_SCALE_H */
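The only operator-specific argument here is the ScaleKernelInfo descriptor from arm_compute/core/KernelDescriptors.h. A hedged configuration sketch for a bilinear 2x upscale with replicated borders (shapes and policies are illustrative):

    TensorInfo src_info(TensorShape(64U, 64U, 3U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U, 128U, 3U), 1, DataType::F32);
    const ScaleKernelInfo scale_info{ InterpolationPolicy::BILINEAR, BorderMode::REPLICATE };

    opencl::ClScale scale;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClScale::validate(&src_info, &dst_info, scale_info));
    scale.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, scale_info);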
diff --git a/src/runtime/gpu/cl/operators/ClSoftmax.cpp b/src/runtime/gpu/cl/operators/ClSoftmax.cpp
deleted file mode 100644
index 975bb0b932..0000000000
--- a/src/runtime/gpu/cl/operators/ClSoftmax.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClSoftmax.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/runtime/gpu/cl/operators/ClPermute.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-#include "support/Cast.h"
-
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace opencl
-{
-ClSoftmax::ClSoftmax()
- : _permute_input(std::make_unique<ClPermute>()),
- _permute_output(std::make_unique<ClPermute>()),
- _max_shift_exp_sum_kernel(std::make_unique<kernels::ClLogits1DMaxShiftExpSumKernel>()),
- _norm_kernel(std::make_unique<kernels::ClLogits1DNormKernel>()),
- _max_info(),
- _sum_info(),
- _tmp_info(),
- _permuted_src_info(),
- _permuted_dst_info(),
- _aux_mem(InternalTensorIdx::COUNT)
-{
-}
-
-void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info));
-
- const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
-
- _needs_permute = actual_axis != 0;
-
- const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src;
- ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst;
-
- if(_needs_permute)
- {
- const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info);
- }
-
- DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type();
- _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type);
-
- TensorShape max_sum_shape = tmp_input_info.tensor_shape();
- _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape);
- _sum_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type);
-
- // Set GPU target to kernels
- _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target());
-
- _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info);
- _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info);
-
- if(_needs_permute)
- {
- const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info);
- }
-
- _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
- _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
- _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
-
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size());
-}
-
-Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported");
- ARM_COMPUTE_UNUSED(info.beta);
- ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis);
-
- const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
- const bool needs_permute = actual_axis != 0;
- if(needs_permute)
- {
- const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
- TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector));
- TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector));
- }
-
- // Create intermediate tensor info
- DataType tmp_data_type = is_data_type_quantized_asymmetric(src.data_type()) ? DataType::S32 : src.data_type();
- TensorInfo tensor_info_tmp(src.clone()->set_data_type(tmp_data_type).set_is_resizable(true));
-
- TensorShape max_sum_shape = src.tensor_shape();
- max_sum_shape.set(0, 1);
- TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
- TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info));
-
- return Status{};
-}
-
-void ClSoftmax::run(ITensorPack &tensors)
-{
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- CLAuxTensorHandler sum(offset_int_vec(InternalTensorIdx::SUM), _sum_info, tensors, false);
- CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false);
- CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false);
-
- CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false);
- CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false);
-
- if(_needs_permute)
- {
- ITensorPack pack;
- pack.add_const_tensor(TensorType::ACL_SRC, src);
- pack.add_tensor(TensorType::ACL_DST, permuted_src.get());
- _permute_input.get()->run(pack);
- }
-
- ITensorPack sum_pack;
- ITensorPack norm_pack;
- if(_needs_permute)
- {
- sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get());
- norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get());
- }
- else
- {
- sum_pack.add_const_tensor(TensorType::ACL_SRC, src);
- norm_pack.add_tensor(TensorType::ACL_DST, dst);
- }
- sum_pack.add_tensor(TensorType::ACL_DST, tmp.get());
- sum_pack.add_tensor(TensorType::ACL_INT_0, max.get());
- sum_pack.add_tensor(TensorType::ACL_INT_1, sum.get());
-
- norm_pack.add_const_tensor(TensorType::ACL_SRC, tmp.get());
- norm_pack.add_tensor(TensorType::ACL_INT_0, sum.get());
-
- CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false);
- CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false);
-
- if(_needs_permute)
- {
- ITensorPack pack;
- pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get());
- pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output.get()->run(pack);
- }
-}
-
-experimental::MemoryRequirements ClSoftmax::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
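The only non-obvious arithmetic above is the axis handling: a possibly negative axis is wrapped into [0, num_dimensions) and any value other than 0 takes the permute path. The fragment below (to be read as in-function code, assuming <cassert> and <cstdint>) is an illustrative stand-in for that mapping, equivalent to wrap_around() for axes in [-num_dims, num_dims):

    // For a 4-D tensor, axis == -1 selects the innermost dimension, i.e. actual_axis == 3,
    // so both ClPermute operators are configured; axis == 0 keeps _needs_permute == false.
    const int32_t num_dims    = 4;
    const auto    actual_axis = [num_dims](int32_t axis) { return (axis + num_dims) % num_dims; };
    assert(actual_axis(-1) == 3); // permute path
    assert(actual_axis(0)  == 0); // fast path, no permutation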
diff --git a/src/runtime/gpu/cl/operators/ClSoftmax.h b/src/runtime/gpu/cl/operators/ClSoftmax.h
deleted file mode 100644
index c85b193d9d..0000000000
--- a/src/runtime/gpu/cl/operators/ClSoftmax.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_SOFTMAX_H
-#define ARM_COMPUTE_CL_SOFTMAX_H
-
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-struct SoftmaxKernelInfo;
-
-namespace opencl
-{
-class ClPermute;
-namespace kernels
-{
-class ClLogits1DMaxShiftExpSumKernel;
-class ClLogits1DNormKernel;
-} // namespace kernels
-class ClSoftmax : public IClOperator
-{
-public:
- /** Constructor */
- ClSoftmax();
- /** Configure the operator
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
- * @param[out] dst Destination tensor info. Data types supported: same as @p src
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClSoftmax::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum InternalTensorIdx
- {
- MAX = 0,
- SUM,
- TMP,
- PERMUTED_SRC,
- PERMUTED_DST,
- COUNT
- };
-
- std::unique_ptr<ClPermute> _permute_input;
- std::unique_ptr<ClPermute> _permute_output;
- std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel;
- std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel;
- bool _needs_permute{ false };
-
- TensorInfo _max_info;
- TensorInfo _sum_info;
- TensorInfo _tmp_info;
- TensorInfo _permuted_src_info;
- TensorInfo _permuted_dst_info;
-
- experimental::MemoryRequirements _aux_mem{};
-};
-
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
\ No newline at end of file
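Scratch memory is reported through workspace() instead of being allocated internally, so the runtime wrapper has to turn each MemoryInfo entry into a backing CLTensor registered under the same slot id before calling run(). The in-tree CLSoftmaxLayer uses the MemoryHelpers utilities for this; the loop below is a simplified, hedged equivalent, assuming softmax_op is a configured opencl::ClSoftmax and pack already carries ACL_SRC/ACL_DST:

    std::vector<std::unique_ptr<CLTensor>> scratch;
    for(const auto &req : softmax_op.workspace())
    {
        if(req.size == 0)
        {
            continue; // e.g. the permute buffers when no permutation is needed
        }
        auto aux = std::make_unique<CLTensor>();
        aux->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        aux->allocator()->allocate();
        pack.add_tensor(req.slot, aux.get()); // same slot ids as offset_int_vec(...) above
        scratch.emplace_back(std::move(aux));
    }
    softmax_op.run(pack);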
diff --git a/src/runtime/gpu/cl/operators/ClSub.cpp b/src/runtime/gpu/cl/operators/ClSub.cpp
deleted file mode 100644
index 429f23a837..0000000000
--- a/src/runtime/gpu/cl/operators/ClSub.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClSub.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
- k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
- _kernel = std::move(k);
-}
-
-Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
-}
-} // namespace opencl
-} // namespace arm_compute
diff --git a/src/runtime/gpu/cl/operators/ClSub.h b/src/runtime/gpu/cl/operators/ClSub.h
deleted file mode 100644
index 2dac11c00e..0000000000
--- a/src/runtime/gpu/cl/operators/ClSub.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_SUB_H
-#define ARM_COMPUTE_CL_SUB_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run arithmetic subtraction
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @note The function performs an arithmetic subtraction between two tensors.
- */
-class ClSub : public IClOperator
-{
-public:
- /** Configure function for a given list of arguments.
- *
- * Valid configurations (src1,src2) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClSub::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_SUB_H */
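The ConvertPolicy argument only matters for integer inputs: with SATURATE an S16 result such as 32767 - (-1) clamps to 32767, whereas WRAP lets it wrap around to -32768; float inputs ignore the policy. A hedged configuration sketch with illustrative S16 shapes:

    TensorInfo a(TensorShape(16U, 16U), 1, DataType::S16);
    TensorInfo b(TensorShape(16U, 16U), 1, DataType::S16);
    TensorInfo out(TensorShape(16U, 16U), 1, DataType::S16);

    opencl::ClSub sub;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClSub::validate(&a, &b, &out, ConvertPolicy::SATURATE));
    sub.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, &out, ConvertPolicy::SATURATE);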
diff --git a/src/runtime/gpu/cl/operators/ClTranspose.cpp b/src/runtime/gpu/cl/operators/ClTranspose.cpp
deleted file mode 100644
index 48f44282e8..0000000000
--- a/src/runtime/gpu/cl/operators/ClTranspose.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClTranspose.h"
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/core/gpu/cl/kernels/ClTransposeKernel.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
-{
- auto k = std::make_unique<kernels::ClTransposeKernel>();
- k->configure(compile_context, src, dst);
- _kernel = std::move(k);
-}
-
-Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- return kernels::ClTransposeKernel::validate(src, dst);
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/gpu/cl/operators/ClTranspose.h b/src/runtime/gpu/cl/operators/ClTranspose.h
deleted file mode 100644
index dcd80820bb..0000000000
--- a/src/runtime/gpu/cl/operators/ClTranspose.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_TRANSPOSE_H
-#define ARM_COMPUTE_CL_TRANSPOSE_H
-
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/** Basic function to run @ref kernels::ClTransposeKernel */
-class ClTranspose : public IClOperator
-{
-public:
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src The src tensor info. Data types supported: All.
- * @param[in] dst The dst tensor info. Data types supported: Same as @p src
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClTranspose::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_TRANSPOSE_H */
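ClTranspose is a plain matrix transpose, so the destination info must have the first two dimensions swapped relative to the source. A short hedged sketch with illustrative shapes:

    // An [8, 24] matrix becomes [24, 8].
    TensorInfo src_info(TensorShape(8U, 24U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(24U, 8U), 1, DataType::F32);

    opencl::ClTranspose transpose;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClTranspose::validate(&src_info, &dst_info));
    transpose.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);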
diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp b/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
deleted file mode 100644
index 07f90ddaef..0000000000
--- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/gpu/cl/operators/ClWinogradConv2d.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
-#include "src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
-#include "src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h"
-#include "support/Cast.h"
-
-using namespace arm_compute::experimental;
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace
-{
-Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout)
-{
- Size2D output_tile = Size2D{};
-
- const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
-
- // Check if the input spatial dimensions are smaller than 4
- const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
-
- if(kernel_max_dim == 3U)
- {
- if(kernel_dims == Size2D(3U, 3U))
- {
- output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
- }
- else if(kernel_dims == Size2D(3U, 1U))
- {
- output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
- }
- else
- {
- output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
- }
- }
- else if(kernel_max_dim == 5U)
- {
- output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
- kernel_dims.height == 1 ? 1U : 4U);
- }
- else if(kernel_max_dim == 7U)
- {
- output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
- kernel_dims.height == 1 ? 1U : 2U);
- }
-
- return output_tile;
-}
-
-bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
-{
- // Check if we want to configure a Winograd configuration which requires fast math
- using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- std::vector<WinogradConfiguration> fast_math_winograd =
- {
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
- WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
- };
-
- auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
- std::pair<int, int>(kernel_size.width, kernel_size.height));
-
- return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
-    // Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
-
- // Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); // Disable Winograd for FP16 if fast math is false.
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
- }
-
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- src->data_layout());
-
- // Validate input transform
- const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
- const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
-
- // Validate filter transform
- const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
- const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
-
- // Validate batched matrix multiply
- TensorShape batched_mm_output_shape = input0.tensor_shape();
- batched_mm_output_shape[0] = input1.tensor_shape()[0];
- const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
- GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
-
- // Configure output transform
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
- return Status{};
-}
-
-} // namespace
-
-ClWinogradConv2d::ClWinogradConv2d()
- : _batched_mm(),
- _input_transform(std::make_unique<kernels::ClWinogradInputTransformKernel>()),
- _filter_transform(std::make_unique<kernels::ClWinogradFilterTransformKernel>()),
- _output_transform(std::make_unique<kernels::ClWinogradOutputTransformKernel>()),
- _border_handler(),
- _input0(),
- _input1(),
- _batched_mm_output(),
- _is_prepared(false),
- _aux_mem()
-{
-}
-
-ClWinogradConv2d::~ClWinogradConv2d() = default;
-
-void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
- // Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
-
- // Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); // Disable Winograd for FP16 if fast math is false.
- ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
- }
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- src->data_layout());
-
- _is_prepared = false;
-
- // Configure input transform
- _input_transform->configure(compile_context, src, &_input0, winograd_info);
- _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue());
-
- // Configure filter transform
- _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
-
- // Configure batched matrix multiply
- _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0,
- false, false,
- GEMMLowpOutputStageInfo(),
- (src->data_type() == DataType::F16)));
-
- // Configure output transform
- _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
-
- _aux_mem = _batched_mm.workspace();
- const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r)
- {
- return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0);
- }) ?
- MemoryLifetime::Prepare :
- MemoryLifetime::Persistent;
- _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
- _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
- _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
-}
-
-Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
- return Status{};
-}
-
-void ClWinogradConv2d::run(ITensorPack &tensors)
-{
- const bool is_gemm_reshaped = _aux_mem[3].lifetime == MemoryLifetime::Prepare;
-
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true);
- CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true, is_gemm_reshaped);
- CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
-
- prepare(tensors);
-
- // Run input transform
- ITensorPack pack_it
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, input0.get() },
- };
- CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
- CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
-
- // Run batched matrix multiplication
- ITensorPack pack_mm = tensors;
- pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
- pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
- is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
- _batched_mm.run(pack_mm);
-
- // Run output transform
- ITensorPack pack_ot
- {
- { TensorType::ACL_SRC_0, batched_mm_output.get() },
- { TensorType::ACL_SRC_1, biases },
- { TensorType::ACL_DST, dst },
- };
- CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
-}
-
-void ClWinogradConv2d::prepare(ITensorPack &tensors)
-{
- if(!_is_prepared)
- {
- auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
-
- CLAuxTensorHandler input1(_input1, *in1_aux);
- ITensorPack pack_ft
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, input1.get() },
- };
- // Run filter transform and mark original weights as unused
- CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
- weights->mark_as_unused();
-
- // Prepare GEMM and release reshaped weights if marked unused by ClGemm
- ITensorPack mm_prepare_pack = tensors;
- mm_prepare_pack.add_tensor(ACL_SRC_1, input1.get());
- _batched_mm.prepare(mm_prepare_pack);
-
- CLScheduler::get().queue().finish();
- _is_prepared = true;
- }
-}
-
-experimental::MemoryRequirements ClWinogradConv2d::workspace() const
-{
- return _aux_mem;
-}
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
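To make the two anonymous-namespace helpers concrete: with NCHW data and an input larger than 4x4, a 5x5 kernel selects a 4x4 output tile, and F(4x4, 5x5) is on the fast-math list, so validation is expected to reject the configuration unless enable_fast_math is set. The shapes below are illustrative and not taken from this patch:

    TensorInfo src(TensorShape(56U, 56U, 32U), 1, DataType::F32);    // NCHW: W=56, H=56, C=32
    TensorInfo wei(TensorShape(5U, 5U, 32U, 16U), 1, DataType::F32); // 5x5 kernel, 16 OFM
    TensorInfo dst(TensorShape(56U, 56U, 16U), 1, DataType::F32);    // stride 1, pad 2 keeps W/H
    const PadStrideInfo conv_info(1, 1, 2, 2);

    const Status no_fast_math = opencl::ClWinogradConv2d::validate(&src, &wei, nullptr, &dst, conv_info,
                                                                   ActivationLayerInfo(), false /* enable_fast_math */);
    const Status fast_math    = opencl::ClWinogradConv2d::validate(&src, &wei, nullptr, &dst, conv_info,
                                                                   ActivationLayerInfo(), true /* enable_fast_math */);
    // Expectation: no_fast_math carries an error, while fast_math.error_code() == ErrorCode::OK.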
diff --git a/src/runtime/gpu/cl/operators/ClWinogradConv2d.h b/src/runtime/gpu/cl/operators/ClWinogradConv2d.h
deleted file mode 100644
index 83b31f1c99..0000000000
--- a/src/runtime/gpu/cl/operators/ClWinogradConv2d.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_WINOGRADCONV2D_H
-#define ARM_COMPUTE_CL_WINOGRADCONV2D_H
-
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/gpu/cl/ClCompileContext.h"
-#include "src/runtime/gpu/cl/IClOperator.h"
-#include "src/runtime/gpu/cl/operators/ClGemm.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ITensorInfo;
-namespace opencl
-{
-namespace kernels
-{
-class ClWinogradInputTransformKernel;
-class ClWinogradFilterTransformKernel;
-class ClWinogradOutputTransformKernel;
-} // namespace kernels
-/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
- *
- * -# @ref kernels::ClWinogradInputTransformKernel
- * -# @ref kernels::ClWinogradFilterTransformKernel (only once)
- * -# @ref ClGemm
- * -# @ref kernels::ClWinogradOutputTransformKernel
- *
- */
-class ClWinogradConv2d : public IClOperator
-{
-public:
- /** Default constructor */
- ClWinogradConv2d();
- /** Default destructor */
- ~ClWinogradConv2d();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ClWinogradConv2d(const ClWinogradConv2d &) = delete;
- /** Default move constructor */
- ClWinogradConv2d(ClWinogradConv2d &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ClWinogradConv2d &operator=(const ClWinogradConv2d &) = delete;
- /** Default move assignment operator */
- ClWinogradConv2d &operator=(ClWinogradConv2d &&) = default;
- /** Set the input and output tensors.
- *
- * Valid data layouts:
- * - NHWC
- * - NCHW
- *
- * Valid data type configurations:
- * |src0 |src1 |src2 |dst |
- * |:--------------|:--------------|:------|:--------------|
- * |F16 |F16 |F16 |F16 |
- * |F32 |F32 |F32 |F32 |
- *
- * @note This function only works with 3x3, 3x1, 1x3, 5x5, 5x1, 1x5, 7x1 and 1x7 kernels, with unit strides, for both the NCHW and NHWC data layouts
- * @note Some Winograd configurations (e.g. F(4x4, 5x5)) are supported only with enable_fast_math = true
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs.
- * Data types supported: F16/F32.
- * @param[in] weights Weights tensor info. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
- * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
- * Data types supported: Same as @p src.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
- * available, which can also reduce accuracy. Default is false.
- */
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to ClWinogradConv2d::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
-
- // Inherited methods overridden
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- ClGemm _batched_mm;
- std::unique_ptr<kernels::ClWinogradInputTransformKernel> _input_transform;
- std::unique_ptr<kernels::ClWinogradFilterTransformKernel> _filter_transform;
- std::unique_ptr<kernels::ClWinogradOutputTransformKernel> _output_transform;
- CLFillBorderKernel _border_handler;
- TensorInfo _input0;
- TensorInfo _input1;
- TensorInfo _batched_mm_output;
- bool _is_prepared;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_WINOGRADCONV2D_H */
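As a point of reference (again, illustrative and not part of this patch), a minimal sketch of driving the validate()/configure() interface declared above. The shapes, the padding and the compile-context source are assumptions, and an initialised CL context/scheduler is assumed to exist before configure() is called.

// Illustrative sketch (not part of this change).
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
// ... plus the ClWinogradConv2d header from its post-move location.

void configure_winograd_example()
{
    using namespace arm_compute;
    // NCHW-style shapes: src [W, H, IFM], weights [kx, ky, IFM, OFM], dst [W, H, OFM].
    TensorInfo src(TensorShape(56U, 56U, 32U), 1, DataType::F32);
    TensorInfo weights(TensorShape(3U, 3U, 32U, 64U), 1, DataType::F32);
    TensorInfo biases(TensorShape(64U), 1, DataType::F32);
    TensorInfo dst(TensorShape(56U, 56U, 64U), 1, DataType::F32);
    const PadStrideInfo conv_info(1, 1, 1, 1); // unit strides, 1-pixel padding

    // Reject unsupported configurations before configuring.
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClWinogradConv2d::validate(&src, &weights, &biases, &dst, conv_info));

    opencl::ClWinogradConv2d conv;
    conv.configure(CLKernelLibrary::get().get_compile_context(), &src, &weights, &biases, &dst, conv_info,
                   ActivationLayerInfo(), /* enable_fast_math */ false);
}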
diff --git a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h b/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
deleted file mode 100644
index af383489a1..0000000000
--- a/src/runtime/gpu/cl/utils/ClAuxTensorHandler.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H
-#define ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H
-
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-#include "src/common/utils/Log.h"
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-/* Tensor handler to wrap tensor allocations on workspace buffers: imports the packed workspace buffer when it is large enough, otherwise falls back to a local allocation */
-class CLAuxTensorHandler
-{
-public:
- CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
- : _tensor()
- {
- if(info.total_size() == 0)
- {
- return;
- }
- _tensor.allocator()->soft_init(info);
-
- ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id));
- if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
- {
- if(!bypass_alloc)
- {
- _tensor.allocator()->allocate();
- ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
- }
-
- if(pack_inject)
- {
- pack.add_tensor(slot_id, &_tensor);
- _injected_tensor_pack = &pack;
- _injected_slot_id = slot_id;
- }
- }
- else
- {
- _tensor.allocator()->import_memory(packed_tensor->cl_buffer());
- }
- }
-
- CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor)
- : _tensor()
- {
- _tensor.allocator()->soft_init(info);
- if(info.total_size() <= tensor.info()->total_size())
- {
- _tensor.allocator()->import_memory(tensor.cl_buffer());
- }
- }
-
- CLAuxTensorHandler(const CLAuxTensorHandler &) = delete;
- CLAuxTensorHandler &operator=(const CLAuxTensorHandler &) = delete;
-
- ~CLAuxTensorHandler()
- {
- if(_injected_tensor_pack)
- {
- _injected_tensor_pack->remove_tensor(_injected_slot_id);
- }
- }
-
- ICLTensor *get()
- {
- return &_tensor;
- }
-
- ICLTensor *operator()()
- {
- return &_tensor;
- }
-
-private:
- CLTensor _tensor{};
- ITensorPack *_injected_tensor_pack{ nullptr };
- int _injected_slot_id{ TensorType::ACL_UNKNOWN };
-};
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */ \ No newline at end of file
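To close, a minimal sketch (not part of this patch) of the pattern CLAuxTensorHandler is designed for inside an operator's run(), mirroring the prepare() shown at the start of this hunk. The slot index, the tensor-info argument, the helper include path and the elided kernel enqueue are assumptions made for the example.

// Illustrative sketch (not part of this change).
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "src/core/helpers/MemoryHelpers.h" // offset_int_vec(), path assumed unchanged by this move
// ... plus the ClAuxTensorHandler header from its post-move location.

void run_with_aux_tensor(arm_compute::ITensorPack &tensors, arm_compute::TensorInfo &transformed_weights_info)
{
    using namespace arm_compute;
    using namespace arm_compute::opencl;

    // Bind the workspace buffer the caller published for this slot; if it is missing
    // or too small, the handler allocates a local CLTensor and, because pack_inject
    // is true, temporarily injects it into the caller's pack.
    CLAuxTensorHandler transformed_weights(offset_int_vec(3), transformed_weights_info, tensors, /* pack_inject */ true);

    ITensorPack gemm_pack = tensors;
    gemm_pack.add_tensor(TensorType::ACL_SRC_1, transformed_weights.get());
    // ... enqueue the kernels / operators that consume transformed_weights.get() ...
} // the handler's destructor removes any injected tensor from the pack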